--- sys/amd64/amd64/apic_vector.S.orig
+++ sys/amd64/amd64/apic_vector.S
@@ -2,7 +2,13 @@
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * All rights reserved.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -38,12 +44,12 @@
 
 #include "opt_smp.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
-#include "assym.s"
-
 #ifdef SMP
 #define LK	lock ;
 #else
@@ -73,30 +79,28 @@
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
-#define	ISR_VEC(index, vec_name)					\
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	cmpl	$0,x2apic_mode ;					\
-	je	1f ;							\
-	movl	$(MSR_APIC_ISR0 + index),%ecx ;				\
-	rdmsr ;								\
-	jmp	2f ;							\
-1: ;									\
-	movq	lapic_map, %rdx ;	/* pointer to local APIC */	\
-	movl	LA_ISR + 16 * (index)(%rdx), %eax ;	/* load ISR */	\
-2: ;									\
-	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */	\
-	jz	3f ;							\
-	addl	$(32 * index),%eax ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	%eax, %edi ;	/* pass the IRQ */			\
-	call	lapic_handle_intr ;					\
-3: ;									\
-	MEXITCOUNT ;							\
+	.macro	ISR_VEC	index, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	cmpl	$0,x2apic_mode
+	je	1f
+	movl	$(MSR_APIC_ISR0 + \index),%ecx
+	rdmsr
+	jmp	2f
+1:
+	movq	lapic_map, %rdx		/* pointer to local APIC */
+	movl	LA_ISR + 16 * (\index)(%rdx), %eax	/* load ISR */
+2:
+	bsrl	%eax, %eax	/* index of highest set bit in ISR */
+	jz	3f
+	addl	$(32 * \index),%eax
+	movq	%rsp, %rsi
+	movl	%eax, %edi	/* pass the IRQ */
+	call	lapic_handle_intr
+3:
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
 /*
  * Handle "spurious INTerrupts".
@@ -108,26 +112,21 @@
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
-
 	/* No EOI cycle used here */
-
 	jmp	doreti_iret
 
-	ISR_VEC(1, apic_isr1)
-	ISR_VEC(2, apic_isr2)
-	ISR_VEC(3, apic_isr3)
-	ISR_VEC(4, apic_isr4)
-	ISR_VEC(5, apic_isr5)
-	ISR_VEC(6, apic_isr6)
-	ISR_VEC(7, apic_isr7)
+	ISR_VEC	1, apic_isr1
+	ISR_VEC	2, apic_isr2
+	ISR_VEC	3, apic_isr3
+	ISR_VEC	4, apic_isr4
+	ISR_VEC	5, apic_isr5
+	ISR_VEC	6, apic_isr6
+	ISR_VEC	7, apic_isr7
 
 /*
  * Local APIC periodic timer handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(timerint)
-	PUSH_FRAME
+	INTR_HANDLER	timerint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	lapic_handle_timer
@@ -137,10 +136,7 @@
 /*
  * Local APIC CMCI handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cmcint)
-	PUSH_FRAME
+	INTR_HANDLER cmcint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
@@ -149,10 +145,7 @@
 /*
  * Local APIC error interrupt handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(errorint)
-	PUSH_FRAME
+	INTR_HANDLER errorint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_error
 	MEXITCOUNT
@@ -163,10 +156,7 @@
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(xen_intr_upcall)
-	PUSH_FRAME
+	INTR_HANDLER xen_intr_upcall
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	xen_intr_handle_upcall
@@ -183,59 +173,59 @@
 	SUPERALIGN_TEXT
 invltlb_ret:
 	call	as_lapic_eoi
-	POP_FRAME
-	jmp	doreti_iret
+	jmp	ld_regs
 
 	SUPERALIGN_TEXT
-IDTVEC(invltlb)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb
 	call	invltlb_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_pcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_pcid
 	call	invltlb_pcid_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_invpcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_invpcid_nopti
 	call	invltlb_invpcid_handler
 	jmp	invltlb_ret
 
+	INTR_HANDLER invltlb_invpcid_pti
+	call	invltlb_invpcid_pti_handler
+	jmp	invltlb_ret
+
 /*
  * Single page TLB shootdown
  */
-	.text
+	INTR_HANDLER invlpg
+	call	invlpg_handler
+	jmp	invltlb_ret
 
-	SUPERALIGN_TEXT
-IDTVEC(invlpg)
-	PUSH_FRAME
+	INTR_HANDLER invlpg_invpcid
+	call	invlpg_invpcid_handler
+	jmp	invltlb_ret
 
-	call	invlpg_handler
+	INTR_HANDLER invlpg_pcid
+	call	invlpg_pcid_handler
 	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlrng)
-	PUSH_FRAME
-
+	INTR_HANDLER invlrng
 	call	invlrng_handler
 	jmp	invltlb_ret
 
+	INTR_HANDLER invlrng_invpcid
+	call	invlrng_invpcid_handler
+	jmp	invltlb_ret
+
+	INTR_HANDLER invlrng_pcid
+	call	invlrng_pcid_handler
+	jmp	invltlb_ret
+
 /*
  * Invalidate cache.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlcache)
-	PUSH_FRAME
-
+	INTR_HANDLER invlcache
 	call	invlcache_handler
 	jmp	invltlb_ret
 
@@ -242,15 +232,9 @@
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(ipi_intr_bitmap_handler)		
-	PUSH_FRAME
-
+	INTR_HANDLER ipi_intr_bitmap_handler
 	call	as_lapic_eoi
-	
 	FAKE_MCOUNT(TF_RIP(%rsp))
-
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
@@ -258,13 +242,8 @@
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpustop)
-	PUSH_FRAME
-
+	INTR_HANDLER cpustop
 	call	as_lapic_eoi
-
 	call	cpustop_handler
 	jmp	doreti
 
@@ -271,11 +250,7 @@
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpususpend)
-	PUSH_FRAME
-
+	INTR_HANDLER cpususpend
 	call	cpususpend_handler
 	call	as_lapic_eoi
 	jmp	doreti
@@ -285,10 +260,7 @@
  *
  * - Calls the generic rendezvous action function.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(rendezvous)
-	PUSH_FRAME
+	INTR_HANDLER rendezvous
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movq	ipi_rendezvous_counts(,%rax,8), %rax
@@ -328,4 +300,8 @@
 	popq	%rax
 	jmp	doreti_iret
 
+	INTR_HANDLER	justreturn1
+	call	as_lapic_eoi
+	jmp	doreti
+
 #endif /* SMP */
--- sys/amd64/amd64/atpic_vector.S.orig
+++ sys/amd64/amd64/atpic_vector.S
@@ -36,38 +36,35 @@
  * master and slave interrupt controllers.
  */
 
+#include "assym.s"
 #include <machine/asmacros.h>
 
-#include "assym.s"
-
 /*
  * Macros for interrupt entry, call to handler, and exit.
  */
-#define	INTR(irq_num, vec_name) \
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	$irq_num, %edi; 	/* pass the IRQ */		\
-	call	atpic_handle_intr ;					\
-	MEXITCOUNT ;							\
+	.macro	INTR	irq_num, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp, %rsi
+	movl	$\irq_num, %edi	 	/* pass the IRQ */
+	call	atpic_handle_intr
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
-	INTR(0, atpic_intr0)
-	INTR(1, atpic_intr1)
-	INTR(2, atpic_intr2)
-	INTR(3, atpic_intr3)
-	INTR(4, atpic_intr4)
-	INTR(5, atpic_intr5)
-	INTR(6, atpic_intr6)
-	INTR(7, atpic_intr7)
-	INTR(8, atpic_intr8)
-	INTR(9, atpic_intr9)
-	INTR(10, atpic_intr10)
-	INTR(11, atpic_intr11)
-	INTR(12, atpic_intr12)
-	INTR(13, atpic_intr13)
-	INTR(14, atpic_intr14)
-	INTR(15, atpic_intr15)
+	INTR	0, atpic_intr0
+	INTR	1, atpic_intr1
+	INTR	2, atpic_intr2
+	INTR	3, atpic_intr3
+	INTR	4, atpic_intr4
+	INTR	5, atpic_intr5
+	INTR	6, atpic_intr6
+	INTR	7, atpic_intr7
+	INTR	8, atpic_intr8
+	INTR	9, atpic_intr9
+	INTR	10, atpic_intr10
+	INTR	11, atpic_intr11
+	INTR	12, atpic_intr12
+	INTR	13, atpic_intr13
+	INTR	14, atpic_intr14
+	INTR	15, atpic_intr15
--- sys/amd64/amd64/cpu_switch.S.orig
+++ sys/amd64/amd64/cpu_switch.S
@@ -191,9 +191,11 @@
 done_tss:
 	movq	%r8,PCPU(RSP0)
 	movq	%r8,PCPU(CURPCB)
-	/* Update the TSS_RSP0 pointer for the next interrupt */
+	/* Update the COMMON_TSS_RSP0 pointer for the next interrupt */
+	cmpb	$0,pti(%rip)
+	jne	1f
 	movq	%r8,COMMON_TSS_RSP0(%rdx)
-	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
+1:	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
 
 	/* Test if debug registers should be restored. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
@@ -270,7 +272,12 @@
 	shrq	$8,%rcx
 	movl	%ecx,8(%rax)
 	movb	$0x89,5(%rax)	/* unset busy */
-	movl	$TSSSEL,%eax
+	cmpb	$0,pti(%rip)
+	je	1f
+	movq	PCPU(PRVSPACE),%rax
+	addq	$PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax
+	movq	%rax,COMMON_TSS_RSP0(%rdx)
+1:	movl	$TSSSEL,%eax
 	ltr	%ax
 	jmp	done_tss
 
--- sys/amd64/amd64/db_trace.c.orig
+++ sys/amd64/amd64/db_trace.c
@@ -200,6 +200,7 @@
 	if (name != NULL) {
 		if (strcmp(name, "calltrap") == 0 ||
 		    strcmp(name, "fork_trampoline") == 0 ||
+		    strcmp(name, "mchk_calltrap") == 0 ||
 		    strcmp(name, "nmi_calltrap") == 0 ||
 		    strcmp(name, "Xdblfault") == 0)
 			frame_type = TRAP;
--- sys/amd64/amd64/exception.S.orig
+++ sys/amd64/amd64/exception.S
@@ -1,12 +1,16 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
- * Copyright (c) 2007 The FreeBSD Foundation
+ * Copyright (c) 2007-2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -38,13 +42,13 @@
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
 
-#include "assym.s"
-
 #ifdef KDTRACE_HOOKS
 	.bss
 	.globl	dtrace_invop_jump_addr
@@ -100,69 +104,62 @@
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
-/* Traps that we leave interrupts disabled for.. */
-#define	TRAP_NOEN(a)	\
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+/* Traps that we leave interrupts disabled for. */
+	.macro	TRAP_NOEN	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l,@function
+X\l:	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps_noen
-IDTVEC(dbg)
-	TRAP_NOEN(T_TRCTRAP)
-IDTVEC(bpt)
-	TRAP_NOEN(T_BPTFLT)
+	.endm
+
+	TRAP_NOEN	dbg, T_TRCTRAP
+	TRAP_NOEN	bpt, T_BPTFLT
 #ifdef KDTRACE_HOOKS
-IDTVEC(dtrace_ret)
-	TRAP_NOEN(T_DTRACE_RET)
+	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
 #endif
 
 /* Regular traps; The cpu does not supply tf_err for these. */
-#define	TRAP(a)	 \
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+	.macro	TRAP	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l,@function
+X\l:
+	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps
-IDTVEC(div)
-	TRAP(T_DIVIDE)
-IDTVEC(ofl)
-	TRAP(T_OFLOW)
-IDTVEC(bnd)
-	TRAP(T_BOUND)
-IDTVEC(ill)
-	TRAP(T_PRIVINFLT)
-IDTVEC(dna)
-	TRAP(T_DNA)
-IDTVEC(fpusegm)
-	TRAP(T_FPOPFLT)
-IDTVEC(mchk)
-	TRAP(T_MCHK)
-IDTVEC(rsvd)
-	TRAP(T_RESERVED)
-IDTVEC(fpu)
-	TRAP(T_ARITHTRAP)
-IDTVEC(xmm)
-	TRAP(T_XMMFLT)
+	.endm
 
-/* This group of traps have tf_err already pushed by the cpu */
-#define	TRAP_ERR(a)	\
-	subq $TF_ERR,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
+	TRAP	div, T_DIVIDE
+	TRAP	ofl, T_OFLOW
+	TRAP	bnd, T_BOUND
+	TRAP	ill, T_PRIVINFLT
+	TRAP	dna, T_DNA
+	TRAP	fpusegm, T_FPOPFLT
+	TRAP	rsvd, T_RESERVED
+	TRAP	fpu, T_ARITHTRAP
+	TRAP	xmm, T_XMMFLT
+
+/* This group of traps have tf_err already pushed by the cpu. */
+	.macro	TRAP_ERR	l, trapno
+	PTI_ENTRY	\l,X\l,has_err=1
+	.globl	X\l
+	.type	X\l,@function
+X\l:
+	subq $TF_ERR,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
 	jmp alltraps
-IDTVEC(tss)
-	TRAP_ERR(T_TSSFLT)
-IDTVEC(missing)
-	subq	$TF_ERR,%rsp
-	movl	$T_SEGNPFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(stk)
-	subq	$TF_ERR,%rsp
-	movl	$T_STKFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(align)
-	TRAP_ERR(T_ALIGNFLT)
+	.endm
 
+	TRAP_ERR	tss, T_TSSFLT
+	TRAP_ERR	align, T_ALIGNFLT
+
 	/*
 	 * alltraps entry point.  Use swapgs if this is the first time in the
 	 * kernel from userland.  Reenable interrupts if they were enabled
@@ -174,25 +171,24 @@
 alltraps:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	alltraps_testi		/* already running with kernel GS.base */
+	jz	1f		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
-alltraps_testi:
-	testl	$PSL_I,TF_RFLAGS(%rsp)
-	jz	alltraps_pushregs_no_rdi
+1:	SAVE_SEGS
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jz	2f
+	call	handle_ibrs_entry
+2:	testl	$PSL_I,TF_RFLAGS(%rsp)
+	jz	alltraps_pushregs_no_rax
 	sti
-alltraps_pushregs_no_rdi:
+alltraps_pushregs_no_rax:
 	movq	%rsi,TF_RSI(%rsp)
-	movq	%rdx,TF_RDX(%rsp)
-	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
-	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
@@ -248,15 +244,18 @@
 alltraps_noen:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f	/* already running with kernel GS.base */
+	jz	1f /* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
-	jmp	alltraps_pushregs_no_rdi
+1:	SAVE_SEGS
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jz	alltraps_pushregs_no_rax
+	call	handle_ibrs_entry
+	jmp	alltraps_pushregs_no_rax
 
 IDTVEC(dblfault)
 	subq	$TF_ERR,%rsp
@@ -278,10 +277,7 @@
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
@@ -288,31 +284,54 @@
 	jz	1f			/* already running with kernel GS.base */
 	swapgs
 1:
-	movq	%rsp,%rdi
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	2f
+	movq	%rax,%cr3
+2:	movq	%rsp,%rdi
 	call	dblfault_handler
-2:
-	hlt
-	jmp	2b
+3:	hlt
+	jmp	3b
 
+	ALIGN_TEXT
+IDTVEC(page_pti)
+	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp)
+	jz	Xpage
+	swapgs
+	pushq	%rax
+	pushq	%rdx
+	movq	%cr3,%rax
+	movq	%rax,PCPU(SAVED_UCR3)
+	PTI_UUENTRY has_err=1
+	subq	$TF_ERR,%rsp
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	jmp	page_u
 IDTVEC(page)
 	subq	$TF_ERR,%rsp
-	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
-	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
+	movq	%rdi,TF_RDI(%rsp)	/* free up GP registers */
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f			/* already running with kernel GS.base */
+	jz	page_cr2		/* already running with kernel GS.base */
 	swapgs
-	movq	PCPU(CURPCB),%rdi
+page_u:	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
+	movq	PCPU(SAVED_UCR3),%rax
+	movq	%rax,PCB_SAVED_UCR3(%rdi)
+	call	handle_ibrs_entry
+page_cr2:
+	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
 	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
+	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
-	jz	alltraps_pushregs_no_rdi
+	jz	alltraps_pushregs_no_rax
 	sti
-	jmp	alltraps_pushregs_no_rdi
+	jmp	alltraps_pushregs_no_rax
 
 	/*
 	 * We have to special-case this one.  If we get a trap in doreti() at
@@ -319,30 +338,71 @@
 	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
 	 * to do a special the swapgs in this case even coming from the kernel.
 	 * XXX linux has a trap handler for their equivalent of load_gs().
+	 *
+	 * On the stack, we have the hardware interrupt frame to return
+	 * to usermode (faulted) and another frame with error code, for
+	 * fault.  For PTI, copy both frames to the main thread stack.
 	 */
-IDTVEC(prot)
+	.macro PROTF_ENTRY name,trapno
+\name\()_pti_doreti:
+	pushq	%rax
+	pushq	%rdx
+	swapgs
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	movq	PCPU(RSP0),%rax
+	subq	$2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
+	MOVE_STACKS	(PTI_SIZE / 4 - 3)
+	movq	%rax,%rsp
+	popq	%rdx
+	popq	%rax
+	swapgs
+	jmp	X\name
+IDTVEC(\name\()_pti)
+	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
+	je	\name\()_pti_doreti
+	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
+	jz	X\name
+	PTI_UENTRY has_err=1
+	swapgs
+IDTVEC(\name)
 	subq	$TF_ERR,%rsp
-	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
+	movl	$\trapno,TF_TRAPNO(%rsp)
+	jmp	prot_addrf
+	.endm
+
+	PROTF_ENTRY	missing, T_SEGNPFLT
+	PROTF_ENTRY	stk, T_STKFLT
+	PROTF_ENTRY	prot, T_PROTFLT
+
 prot_addrf:
 	movq	$0,TF_ADDR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movw	%fs,TF_FS(%rsp)
+	movw	%gs,TF_GS(%rsp)
 	leaq	doreti_iret(%rip),%rdi
 	cmpq	%rdi,TF_RIP(%rsp)
-	je	1f			/* kernel but with user gsbase!! */
+	je	5f			/* kernel but with user gsbase!! */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	2f			/* already running with kernel GS.base */
-1:	swapgs
-2:	movq	PCPU(CURPCB),%rdi
+	jz	6f			/* already running with kernel GS.base */
+	swapgs
+	movq	PCPU(CURPCB),%rdi
+4:	call	handle_ibrs_entry
 	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* always full iret from GPF */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
-	jz	alltraps_pushregs_no_rdi
+	jz	alltraps_pushregs_no_rax
 	sti
-	jmp	alltraps_pushregs_no_rdi
+	jmp	alltraps_pushregs_no_rax
 
+5:	swapgs
+6:	movq	PCPU(CURPCB),%rdi
+	jmp	4b
+
 /*
  * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
  * and the new privilige level.  We are still running on the old user stack
@@ -352,8 +412,18 @@
  * We do not support invoking this from a custom %cs or %ss (e.g. using
  * entries from an LDT).
  */
+	SUPERALIGN_TEXT
+IDTVEC(fast_syscall_pti)
+	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	jmp	fast_syscall_common
+	SUPERALIGN_TEXT
 IDTVEC(fast_syscall)
 	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+fast_syscall_common:
 	movq	%rsp,PCPU(SCRATCH_RSP)
 	movq	PCPU(RSP0),%rsp
 	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
@@ -363,10 +433,11 @@
 	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
 	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
 	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	movq	PCPU(SCRATCH_RAX),%rax
+	movq	%rax,TF_RAX(%rsp)	/* syscall number */
+	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
+	SAVE_SEGS
+	call	handle_ibrs_entry
 	movq	PCPU(CURPCB),%r11
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
 	sti
@@ -375,11 +446,9 @@
 	movq	$2,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
 	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
-	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
 	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
 	movq	%r8,TF_R8(%rsp)		/* arg 5 */
 	movq	%r9,TF_R9(%rsp)		/* arg 6 */
-	movq	%rax,TF_RAX(%rsp)	/* syscall number */
 	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
 	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
 	movq	%r12,TF_R12(%rsp)	/* C preserved */
@@ -398,11 +467,12 @@
 	/* Disable interrupts before testing PCB_FULL_IRET. */
 	cli
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
-	jnz	3f
+	jnz	4f
 	/* Check for and handle AST's on return to userland. */
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
-	jne	2f
+	jne	3f
+	call	handle_ibrs_exit
 	/* Restore preserved registers. */
 	MEXITCOUNT
 	movq	TF_RDI(%rsp),%rdi	/* bonus; preserve arg 1 */
@@ -412,16 +482,21 @@
 	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
 	movq	TF_RIP(%rsp),%rcx	/* original %rip */
 	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
-	swapgs
+	cmpb	$0,pti
+	je	2f
+	movq	PCPU(UCR3),%r9
+	movq	%r9,%cr3
+	xorl	%r9d,%r9d
+2:	swapgs
 	sysretq
 
-2:	/* AST scheduled. */
+3:	/* AST scheduled. */
 	sti
 	movq	%rsp,%rdi
 	call	ast
 	jmp	1b
 
-3:	/* Requested full context restore, use doreti for that. */
+4:	/* Requested full context restore, use doreti for that. */
 	MEXITCOUNT
 	jmp	doreti
 
@@ -477,10 +552,7 @@
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	xorl	%ebx,%ebx
@@ -487,7 +559,8 @@
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	nmi_fromuserspace
 	/*
-	 * We've interrupted the kernel.  Preserve GS.base in %r12.
+	 * We've interrupted the kernel.  Preserve GS.base in %r12,
+	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTRL in %r14d.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
@@ -499,10 +572,32 @@
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	nmi_calltrap
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	rdmsr
+	movl	%eax,%r14d
+	call	handle_ibrs_entry
 	jmp	nmi_calltrap
 nmi_fromuserspace:
 	incl	%ebx
 	swapgs
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	call	handle_ibrs_entry
+	movq	PCPU(CURPCB),%rdi
+	testq	%rdi,%rdi
+	jz	3f
+	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
+3:
 /* Note: this label is also used by ddb and gdb: */
 nmi_calltrap:
 	FAKE_MCOUNT(TF_RIP(%rsp))
@@ -525,14 +620,9 @@
 	movq	PCPU(CURTHREAD),%rax
 	orq	%rax,%rax	/* curthread present? */
 	jz	nocallchain
-	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
-	jz	nocallchain
 	/*
-	 * A user callchain is to be captured, so:
-	 * - Move execution to the regular kernel stack, to allow for
-	 *   nested NMI interrupts.
-	 * - Take the processor out of "NMI" mode by faking an "iret".
-	 * - Enable interrupts, so that copyin() can work.
+	 * Move execution to the regular kernel stack, because we
+	 * committed to return through doreti.
 	 */
 	movq	%rsp,%rsi	/* source stack pointer */
 	movq	$TF_SIZE,%rcx
@@ -539,12 +629,20 @@
 	movq	PCPU(RSP0),%rdx
 	subq	%rcx,%rdx
 	movq	%rdx,%rdi	/* destination stack pointer */
-
 	shrq	$3,%rcx		/* trap frame size in long words */
 	cld
 	rep
 	movsq			/* copy trapframe */
+	movq	%rdx,%rsp	/* we are on the regular kstack */
 
+	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
+	jz	nocallchain
+	/*
+	 * A user callchain is to be captured, so:
+	 * - Take the processor out of "NMI" mode by faking an "iret",
+	 *   to allow for nested NMI interrupts.
+	 * - Enable interrupts, so that copyin() can work.
+	 */
 	movl	%ss,%eax
 	pushq	%rax		/* tf_ss */
 	pushq	%rdx		/* tf_rsp (on kernel stack) */
@@ -574,33 +672,139 @@
 	cli
 nocallchain:
 #endif
-	testl	%ebx,%ebx
+	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
 	jnz	doreti_exit
-nmi_kernelexit:
 	/*
+	 * Restore speculation control MSR, if preserved.
+	 */
+	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	1f
+	movl	%r14d,%eax
+	xorl	%edx,%edx
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	wrmsr
+	/*
 	 * Put back the preserved MSR_GSBASE value.
 	 */
+1:	movl	$MSR_GSBASE,%ecx
+	movq	%r12,%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%r13,%cr3
+	RESTORE_REGS
+	addq	$TF_RIP,%rsp
+	jmp	doreti_iret
+
+/*
+ * MC# handling is similar to NMI.
+ *
+ * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
+ * can occur at any time with a GS.base value that does not correspond
+ * to the privilege level in CS.
+ *
+ * Machine checks are not unblocked by iretq, but it is best to run
+ * the handler with interrupts disabled since the exception may have
+ * interrupted a critical section.
+ *
+ * The MC# handler runs on its own stack (tss_ist3).  The canonical
+ * GS.base value for the processor is stored just above the bottom of
+ * its MC# stack.  For exceptions taken from kernel mode, the current
+ * value in the processor's GS.base is saved at entry to C-preserved
+ * register %r12, the canonical value for GS.base is then loaded into
+ * the processor, and the saved value is restored at exit time.  For
+ * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
+ * are used for swapping GS.base.
+ */
+
+IDTVEC(mchk)
+	subq	$TF_RIP,%rsp
+	movl	$(T_MCHK),TF_TRAPNO(%rsp)
+	movq	$0,TF_ADDR(%rsp)
+	movq	$0,TF_ERR(%rsp)
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rsi,TF_RSI(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movq	%r8,TF_R8(%rsp)
+	movq	%r9,TF_R9(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rbx,TF_RBX(%rsp)
+	movq	%rbp,TF_RBP(%rsp)
+	movq	%r10,TF_R10(%rsp)
+	movq	%r11,TF_R11(%rsp)
+	movq	%r12,TF_R12(%rsp)
+	movq	%r13,TF_R13(%rsp)
+	movq	%r14,TF_R14(%rsp)
+	movq	%r15,TF_R15(%rsp)
+	SAVE_SEGS
+	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
+	cld
+	xorl	%ebx,%ebx
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jnz	mchk_fromuserspace
+	/*
+	 * We've interrupted the kernel.  Preserve GS.base in %r12,
+	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTRL in %r14d.
+	 */
 	movl	$MSR_GSBASE,%ecx
+	rdmsr
+	movq	%rax,%r12
+	shlq	$32,%rdx
+	orq	%rdx,%r12
+	/* Retrieve and load the canonical value for GS.base. */
+	movq	TF_SIZE(%rsp),%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	mchk_calltrap
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	rdmsr
+	movl	%eax,%r14d
+	call	handle_ibrs_entry
+	jmp	mchk_calltrap
+mchk_fromuserspace:
+	incl	%ebx
+	swapgs
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	call	handle_ibrs_entry
+/* Note: this label is also used by ddb and gdb: */
+mchk_calltrap:
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp,%rdi
+	call	mca_intr
+	MEXITCOUNT
+	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
+	jnz	doreti_exit
+	/*
+	 * Restore speculation control MSR, if preserved.
+	 */
+	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	1f
+	movl	%r14d,%eax
+	xorl	%edx,%edx
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	wrmsr
+	/*
+	 * Put back the preserved MSR_GSBASE value.
+	 */
+1:	movl	$MSR_GSBASE,%ecx
 	movq	%r12,%rdx
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
-nmi_restoreregs:
-	movq	TF_RDI(%rsp),%rdi
-	movq	TF_RSI(%rsp),%rsi
-	movq	TF_RDX(%rsp),%rdx
-	movq	TF_RCX(%rsp),%rcx
-	movq	TF_R8(%rsp),%r8
-	movq	TF_R9(%rsp),%r9
-	movq	TF_RAX(%rsp),%rax
-	movq	TF_RBX(%rsp),%rbx
-	movq	TF_RBP(%rsp),%rbp
-	movq	TF_R10(%rsp),%r10
-	movq	TF_R11(%rsp),%r11
-	movq	TF_R12(%rsp),%r12
-	movq	TF_R13(%rsp),%r13
-	movq	TF_R14(%rsp),%r14
-	movq	TF_R15(%rsp),%r15
+	movq	%r13,%cr3
+	RESTORE_REGS
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret
 
@@ -767,27 +971,39 @@
 ld_ds:
 	movw	TF_DS(%rsp),%ds
 ld_regs:
-	movq	TF_RDI(%rsp),%rdi
-	movq	TF_RSI(%rsp),%rsi
-	movq	TF_RDX(%rsp),%rdx
-	movq	TF_RCX(%rsp),%rcx
-	movq	TF_R8(%rsp),%r8
-	movq	TF_R9(%rsp),%r9
-	movq	TF_RAX(%rsp),%rax
-	movq	TF_RBX(%rsp),%rbx
-	movq	TF_RBP(%rsp),%rbp
-	movq	TF_R10(%rsp),%r10
-	movq	TF_R11(%rsp),%r11
-	movq	TF_R12(%rsp),%r12
-	movq	TF_R13(%rsp),%r13
-	movq	TF_R14(%rsp),%r14
-	movq	TF_R15(%rsp),%r15
+	RESTORE_REGS
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f			/* keep running with kernel GS.base */
+	jz	2f			/* keep running with kernel GS.base */
 	cli
+	call	handle_ibrs_exit_rs
+	cmpb	$0,pti
+	je	1f
+	pushq	%rdx
+	movq	PCPU(PRVSPACE),%rdx
+	addq	$PC_PTI_STACK+PC_PTI_STACK_SZ*8-PTI_SIZE,%rdx
+	movq	%rax,PTI_RAX(%rdx)
+	popq	%rax
+	movq	%rax,PTI_RDX(%rdx)
+	movq	TF_RIP(%rsp),%rax
+	movq	%rax,PTI_RIP(%rdx)
+	movq	TF_CS(%rsp),%rax
+	movq	%rax,PTI_CS(%rdx)
+	movq	TF_RFLAGS(%rsp),%rax
+	movq	%rax,PTI_RFLAGS(%rdx)
+	movq	TF_RSP(%rsp),%rax
+	movq	%rax,PTI_RSP(%rdx)
+	movq	TF_SS(%rsp),%rax
+	movq	%rax,PTI_SS(%rdx)
+	movq	PCPU(UCR3),%rax
 	swapgs
-1:
-	addq	$TF_RIP,%rsp		/* skip over tf_err, tf_trapno */
+	movq	%rdx,%rsp
+	movq	%rax,%cr3
+	popq	%rdx
+	popq	%rax
+	addq	$8,%rsp
+	jmp	doreti_iret
+1:	swapgs
+2:	addq	$TF_RIP,%rsp
 	.globl	doreti_iret
 doreti_iret:
 	iretq
@@ -811,22 +1027,20 @@
 	.globl	doreti_iret_fault
 doreti_iret_fault:
 	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
-	testl	$PSL_I,TF_RFLAGS(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	call	handle_ibrs_entry
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
-	movq	%rdx,TF_RDX(%rsp)
-	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
-	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
@@ -845,7 +1059,7 @@
 	.globl	ds_load_fault
 ds_load_fault:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
-	testl	$PSL_I,TF_RFLAGS(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:
--- sys/amd64/amd64/genassym.c.orig
+++ sys/amd64/amd64/genassym.c
@@ -145,6 +145,7 @@
 ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
+ASSYM(PCB_SAVED_UCR3, offsetof(struct pcb, pcb_saved_ucr3));
 ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp));
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
@@ -190,6 +191,16 @@
 ASSYM(TF_SIZE, sizeof(struct trapframe));
 ASSYM(TF_HASSEGS, TF_HASSEGS);
 
+ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx));
+ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax));
+ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err));
+ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip));
+ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs));
+ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags));
+ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp));
+ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss));
+ASSYM(PTI_SIZE, sizeof(struct pti_frame));
+
 ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
 ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
 ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
@@ -206,6 +217,7 @@
 ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
+ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
 ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
 ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
@@ -215,6 +227,12 @@
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
 ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
+ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
+ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
+ASSYM(PC_SAVED_UCR3, offsetof(struct pcpu, pc_saved_ucr3));
+ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
+ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);
+ASSYM(PC_IBPB_SET, offsetof(struct pcpu, pc_ibpb_set));
  
 ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
 ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
--- sys/amd64/amd64/initcpu.c.orig
+++ sys/amd64/amd64/initcpu.c
@@ -194,6 +194,7 @@
 		wrmsr(MSR_EFER, msr);
 		pg_nx = PG_NX;
 	}
+	hw_ibrs_recalculate();
 	switch (cpu_vendor_id) {
 	case CPU_VENDOR_AMD:
 		init_amd();
--- sys/amd64/amd64/machdep.c.orig
+++ sys/amd64/amd64/machdep.c
@@ -114,6 +114,7 @@
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
+#include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
@@ -149,6 +150,14 @@
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
+/*
+ * The PTI trampoline stack needs enough space for a hardware trapframe and a
+ * couple of scratch registers, as well as the trapframe left behind after an
+ * iret fault.
+ */
+CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
+    offsetof(struct pti_frame, pti_rip));
+
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
@@ -180,12 +189,6 @@
 	.msi_init =			msi_init,
 };
 
-/*
- * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its value is
- * the physical address at which the kernel is loaded.
- */
-extern char kernphys[];
-
 struct msgbuf *msgbufp;
 
 /*
@@ -670,7 +673,7 @@
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
-
+static char mce0_stack[PAGE_SIZE] __aligned(16);
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
@@ -824,13 +827,20 @@
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
+	IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
+	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
+	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
+	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
+	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
+	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
-	IDTVEC(dtrace_ret),
+	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
 #endif
 #ifdef XENHVM
-	IDTVEC(xen_intr_upcall),
+	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
 #endif
-	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
+	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
+	IDTVEC(fast_syscall_pti);
 
 #ifdef DDB
 /*
@@ -1523,6 +1533,23 @@
 #endif
 }
 
+/* Set up the fast syscall stuff */
+void
+amd64_conf_fast_syscall(void)
+{
+	uint64_t msr;
+
+	msr = rdmsr(MSR_EFER) | EFER_SCE;
+	wrmsr(MSR_EFER, msr);
+	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
+	    (u_int64_t)IDTVEC(fast_syscall));
+	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
+	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
+	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
+	wrmsr(MSR_STAR, msr);
+	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
+}
+
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
@@ -1531,7 +1558,7 @@
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct xstate_hdr *xhdr;
-	u_int64_t msr;
+	u_int64_t rsp0;
 	char *env;
 	size_t kstack0_sz;
 	int late_console;
@@ -1544,6 +1571,8 @@
 
 	kmdp = init_ops.parse_preload_data(modulep);
 
+	identify_cpu1();
+
 	/* Init basic tunables, hz etc */
 	init_param1();
 
@@ -1600,34 +1629,55 @@
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/* exceptions */
+	pti = pti_get_default();
+	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
+
 	for (x = 0; x < NIDT; x++)
-		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
+		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
+		    SEL_KPL, 0);
+	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT,
+	    SEL_KPL, 0);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
- 	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
-	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
+	    SEL_UPL, 0);
+	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
+	    SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
-	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
+	    SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
+	    SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
+	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
+	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
-	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
+	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
+	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
-	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
+	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
+	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
 #endif
-
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
@@ -1648,7 +1698,7 @@
 	    != NULL)
 		vty_set_preferred(VTY_VT);
 
-	identify_cpu();		/* Final stage of CPU initialization */
+	finishidentcpu();	/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 	initializecpucache();
 
@@ -1663,6 +1713,14 @@
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist2 = (long) np;
 
+	/*
+	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
+	 * above the start of the ist3 stack.
+	 */
+	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
+	np->np_pcpu = (register_t) pc;
+	common_tss[0].tss_ist3 = (long) np;
+
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
 
@@ -1669,15 +1727,7 @@
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
-	/* Set up the fast syscall stuff */
-	msr = rdmsr(MSR_EFER) | EFER_SCE;
-	wrmsr(MSR_EFER, msr);
-	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
-	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
-	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
-	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
-	wrmsr(MSR_STAR, msr);
-	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
+	amd64_conf_fast_syscall();
 
 	/*
 	 * Temporary forge some valid pointer to PCB, for exception
@@ -1749,10 +1799,12 @@
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
-	common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
+	rsp0 = (vm_offset_t)thread0.td_pcb;
 	/* Ensure the stack is aligned to 16 bytes */
-	common_tss[0].tss_rsp0 &= ~0xFul;
-	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
+	rsp0 &= ~0xFul;
+	common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) +
+	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0;
+	PCPU_SET(rsp0, rsp0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/* transfer to user mode */
@@ -1782,6 +1834,8 @@
 #endif
 	thread0.td_critnest = 0;
 
+	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
+
 	/* Location of kernel stack for locore */
 	return ((u_int64_t)thread0.td_pcb);
 }
--- sys/amd64/amd64/mp_machdep.c.orig
+++ sys/amd64/amd64/mp_machdep.c
@@ -85,10 +85,9 @@
 
 /* Temporary variables for init_secondary()  */
 char *doublefault_stack;
+char *mce_stack;
 char *nmi_stack;
 
-extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
-
 /*
  * Local data and functions.
  */
@@ -132,33 +131,50 @@
 	/* Install an inter-CPU IPI for TLB invalidation */
 	if (pmap_pcid_enabled) {
 		if (invpcid_works) {
-			setidt(IPI_INVLTLB, IDTVEC(invltlb_invpcid),
-			    SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLTLB, pti ?
+			    IDTVEC(invltlb_invpcid_pti_pti) :
+			    IDTVEC(invltlb_invpcid_nopti), SDT_SYSIGT,
+			    SEL_KPL, 0);
+			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_invpcid_pti) :
+			    IDTVEC(invlpg_invpcid), SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_invpcid_pti) :
+			    IDTVEC(invlrng_invpcid), SDT_SYSIGT, SEL_KPL, 0);
 		} else {
-			setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT,
-			    SEL_KPL, 0);
+			setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) :
+			    IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pcid_pti) :
+			    IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pcid_pti) :
+			    IDTVEC(invlrng_pcid), SDT_SYSIGT, SEL_KPL, 0);
 		}
 	} else {
-		setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
+		setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb),
+		    SDT_SYSIGT, SEL_KPL, 0);
+		setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
+		    SDT_SYSIGT, SEL_KPL, 0);
+		setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
+		    SDT_SYSIGT, SEL_KPL, 0);
 	}
-	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for cache invalidation. */
-	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache),
+	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
-	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) :
+	    IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install generic inter-CPU IPI handler */
-	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
-	       SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) :
+	    IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU stop/restart */
-	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
+	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU suspend/resume */
-	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
+	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
@@ -188,7 +204,7 @@
 {
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
-	u_int64_t msr, cr0;
+	u_int64_t cr0;
 	int cpu, gsel_tss, x;
 	struct region_descriptor ap_gdt;
 
@@ -197,7 +213,6 @@
 
 	/* Init tss */
 	common_tss[cpu] = common_tss[0];
-	common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */
 	common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
@@ -206,6 +221,10 @@
 	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	common_tss[cpu].tss_ist2 = (long) np;
 
+	/* The MC# stack runs on IST3. */
+	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
+	common_tss[cpu].tss_ist3 = (long) np;
+
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
 	for (x = 0; x < NGDT; x++) {
@@ -240,10 +259,17 @@
 	pc->pc_curpmap = kernel_pmap;
 	pc->pc_pcid_gen = 1;
 	pc->pc_pcid_next = PMAP_PCID_KERN + 1;
+	common_tss[cpu].tss_rsp0 = pti ? ((vm_offset_t)&pc->pc_pti_stack +
+	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0;
 
 	/* Save the per-cpu pointer for use by the NMI handler. */
+	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t) pc;
 
+	/* Save the per-cpu pointer for use by the MC# handler. */
+	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
+	np->np_pcpu = (register_t) pc;
+
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
@@ -263,15 +289,7 @@
 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 	load_cr0(cr0);
 
-	/* Set up the fast syscall stuff */
-	msr = rdmsr(MSR_EFER) | EFER_SCE;
-	wrmsr(MSR_EFER, msr);
-	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
-	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
-	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
-	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
-	wrmsr(MSR_STAR, msr);
-	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
+	amd64_conf_fast_syscall();
 
 	/* signal our startup to the BSP. */
 	mp_naps++;
@@ -346,6 +364,8 @@
 		    kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO);
 		doublefault_stack = (char *)kmem_malloc(kernel_arena,
 		    PAGE_SIZE, M_WAITOK | M_ZERO);
+		mce_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+		    M_WAITOK | M_ZERO);
 		nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,
@@ -428,9 +448,43 @@
 }
 
 void
+invltlb_invpcid_pti_handler(void)
+{
+	struct invpcid_descr d;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;
+	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+	d.pad = 0;
+	d.addr = 0;
+	if (smp_tlb_pmap == kernel_pmap) {
+		/*
+		 * This invalidation actually needs to clear kernel
+		 * mappings from the TLB in the current pmap, but
+		 * since we were asked for the flush in the kernel
+		 * pmap, achieve it by performing global flush.
+		 */
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		invpcid(&d, INVPCID_CTX);
+		d.pcid |= PMAP_PCID_USER_PT;
+		invpcid(&d, INVPCID_CTX);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
 invltlb_pcid_handler(void)
 {
-	uint32_t generation;
+	uint64_t kcr3, ucr3;
+	uint32_t generation, pcid;
   
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
@@ -451,9 +505,132 @@
 		 * CPU.
 		 */
 		if (PCPU_GET(curpmap) == smp_tlb_pmap) {
-			load_cr3(smp_tlb_pmap->pm_cr3 |
-			    smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
+			pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+			kcr3 = smp_tlb_pmap->pm_cr3 | pcid;
+			ucr3 = smp_tlb_pmap->pm_ucr3;
+			if (ucr3 != PMAP_NO_CR3) {
+				ucr3 |= PMAP_PCID_USER_PT | pcid;
+				pmap_pti_pcid_invalidate(ucr3, kcr3);
+			} else
+				load_cr3(kcr3);
 		}
 	}
 	PCPU_SET(smp_tlb_done, generation);
 }
+
+void
+invlpg_invpcid_handler(void)
+{
+	struct invpcid_descr d;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	invlpg(smp_tlb_addr1);
+	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
+		    PMAP_PCID_USER_PT;
+		d.pad = 0;
+		d.addr = smp_tlb_addr1;
+		invpcid(&d, INVPCID_ADDR);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlpg_pcid_handler(void)
+{
+	uint64_t kcr3, ucr3;
+	uint32_t generation;
+	uint32_t pcid;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	invlpg(smp_tlb_addr1);
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_invpcid_handler(void)
+{
+	struct invpcid_descr d;
+	vm_offset_t addr, addr2;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
+		    PMAP_PCID_USER_PT;
+		d.pad = 0;
+		d.addr = smp_tlb_addr1;
+		do {
+			invpcid(&d, INVPCID_ADDR);
+			d.addr += PAGE_SIZE;
+		} while (d.addr < addr2);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
+
+void
+invlrng_pcid_handler(void)
+{
+	vm_offset_t addr, addr2;
+	uint64_t kcr3, ucr3;
+	uint32_t generation;
+	uint32_t pcid;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
+		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+		pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2);
+	}
+	PCPU_SET(smp_tlb_done, generation);
+}
--- sys/amd64/amd64/pmap.c.orig
+++ sys/amd64/amd64/pmap.c
@@ -9,11 +9,17 @@
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -147,6 +153,7 @@
 #ifdef SMP
 #include <machine/smp.h>
 #endif
+#include <machine/tss.h>
 
 static __inline boolean_t
 pmap_type_guest(pmap_t pmap)
@@ -208,6 +215,8 @@
 	return (mask);
 }
 
+static pt_entry_t pg_g;
+
 static __inline pt_entry_t
 pmap_global_bit(pmap_t pmap)
 {
@@ -215,7 +224,7 @@
 
 	switch (pmap->pm_type) {
 	case PT_X86:
-		mask = X86_PG_G;
+		mask = pg_g;
 		break;
 	case PT_RVI:
 	case PT_EPT:
@@ -405,6 +414,15 @@
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
 
+int pti = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &pti, 0,
+    "Page Table Isolation enabled");
+static vm_object_t pti_obj;
+static pml4_entry_t *pti_pml4;
+static vm_pindex_t pti_pg_idx;
+static bool pti_finalized;
+
 static int
 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 {
@@ -622,6 +640,11 @@
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
+static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
+    bool exec);
+static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
+static pd_entry_t *pmap_pti_pde(vm_offset_t va);
+static void pmap_pti_wire_pte(void *pte);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
@@ -901,7 +924,7 @@
 	/* XXX not fully used, underneath 2M pages */
 	pt_p = (pt_entry_t *)KPTphys;
 	for (i = 0; ptoa(i) < *firstaddr; i++)
-		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
+		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g;
 
 	/* Now map the page tables at their location within PTmap */
 	pd_p = (pd_entry_t *)KPDphys;
@@ -912,7 +935,7 @@
 	/* This replaces some of the KPTphys entries above */
 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
 		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
-		    X86_PG_G;
+		    pg_g;
 
 	/* And connect up the PD to the PDP (leaving room for L4 pages) */
 	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
@@ -932,7 +955,7 @@
 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
 		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A;
 	}
 	pdp_p = (pdp_entry_t *)DMPDPphys;
@@ -939,7 +962,7 @@
 	for (i = 0; i < ndm1g; i++) {
 		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A;
 	}
 	for (j = 0; i < ndmpdp; i++, j++) {
@@ -982,6 +1005,9 @@
 	pt_entry_t *pte;
 	int i;
 
+	if (!pti)
+		pg_g = X86_PG_G;
+
 	/*
 	 * Create an initial set of page tables to run the kernel in.
 	 */
@@ -1014,6 +1040,7 @@
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	kernel_pmap->pm_cr3 = KPML4phys;
+	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	kernel_pmap->pm_flags = pmap_flags;
@@ -1528,6 +1555,9 @@
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	cpuset_t *mask;
+	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 	u_int cpuid, i;
 
 	if (pmap_type_guest(pmap)) {
@@ -1544,9 +1574,32 @@
 		mask = &all_cpus;
 	} else {
 		cpuid = PCPU_GET(cpuid);
-		if (pmap == PCPU_GET(curpmap))
+		if (pmap == PCPU_GET(curpmap)) {
 			invlpg(va);
-		else if (pmap_pcid_enabled)
+			if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
+				/*
+				 * Disable context switching. pm_pcid
+				 * is recalculated on switch, which
+				 * might make us use wrong pcid below.
+				 */
+				critical_enter();
+				pcid = pmap->pm_pcids[cpuid].pm_pcid;
+
+				if (invpcid_works) {
+					d.pcid = pcid | PMAP_PCID_USER_PT;
+					d.pad = 0;
+					d.addr = va;
+					invpcid(&d, INVPCID_ADDR);
+				} else {
+					kcr3 = pmap->pm_cr3 | pcid |
+					    CR3_PCID_SAVE;
+					ucr3 = pmap->pm_ucr3 | pcid |
+					    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+					pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+				}
+				critical_exit();
+			}
+		} else if (pmap_pcid_enabled)
 			pmap->pm_pcids[cpuid].pm_gen = 0;
 		if (pmap_pcid_enabled) {
 			CPU_FOREACH(i) {
@@ -1556,7 +1609,7 @@
 		}
 		mask = &pmap->pm_active;
 	}
-	smp_masked_invlpg(*mask, va);
+	smp_masked_invlpg(*mask, va, pmap);
 	sched_unpin();
 }
 
@@ -1567,7 +1620,10 @@
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	cpuset_t *mask;
+	struct invpcid_descr d;
 	vm_offset_t addr;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 	u_int cpuid, i;
 
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
@@ -1593,6 +1649,26 @@
 		if (pmap == PCPU_GET(curpmap)) {
 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
 				invlpg(addr);
+			if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
+				critical_enter();
+				pcid = pmap->pm_pcids[cpuid].pm_pcid;
+				if (invpcid_works) {
+					d.pcid = pcid | PMAP_PCID_USER_PT;
+					d.pad = 0;
+					d.addr = sva;
+					for (; d.addr < eva; d.addr +=
+					    PAGE_SIZE)
+						invpcid(&d, INVPCID_ADDR);
+				} else {
+					kcr3 = pmap->pm_cr3 | pcid |
+					    CR3_PCID_SAVE;
+					ucr3 = pmap->pm_ucr3 | pcid |
+					    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+					pmap_pti_pcid_invlrng(ucr3, kcr3, sva,
+					    eva);
+				}
+				critical_exit();
+			}
 		} else if (pmap_pcid_enabled) {
 			pmap->pm_pcids[cpuid].pm_gen = 0;
 		}
@@ -1604,7 +1680,7 @@
 		}
 		mask = &pmap->pm_active;
 	}
-	smp_masked_invlpg_range(*mask, sva, eva);
+	smp_masked_invlpg_range(*mask, sva, eva, pmap);
 	sched_unpin();
 }
 
@@ -1613,6 +1689,8 @@
 {
 	cpuset_t *mask;
 	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 	u_int cpuid, i;
 
 	if (pmap_type_guest(pmap)) {
@@ -1636,15 +1714,29 @@
 		cpuid = PCPU_GET(cpuid);
 		if (pmap == PCPU_GET(curpmap)) {
 			if (pmap_pcid_enabled) {
+				critical_enter();
+				pcid = pmap->pm_pcids[cpuid].pm_pcid;
 				if (invpcid_works) {
-					d.pcid = pmap->pm_pcids[cpuid].pm_pcid;
+					d.pcid = pcid;
 					d.pad = 0;
 					d.addr = 0;
 					invpcid(&d, INVPCID_CTX);
+					if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+						d.pcid |= PMAP_PCID_USER_PT;
+						invpcid(&d, INVPCID_CTX);
+					}
 				} else {
-					load_cr3(pmap->pm_cr3 | pmap->pm_pcids
-					    [PCPU_GET(cpuid)].pm_pcid);
+					kcr3 = pmap->pm_cr3 | pcid;
+					ucr3 = pmap->pm_ucr3;
+					if (ucr3 != PMAP_NO_CR3) {
+						ucr3 |= pcid | PMAP_PCID_USER_PT;
+						pmap_pti_pcid_invalidate(ucr3,
+						    kcr3);
+					} else {
+						load_cr3(kcr3);
+					}
 				}
+				critical_exit();
 			} else {
 				invltlb();
 			}
@@ -1749,6 +1841,9 @@
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
+	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
+	uint32_t pcid;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
@@ -1757,9 +1852,26 @@
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
 
-	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
+	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		invlpg(va);
-	else if (pmap_pcid_enabled)
+		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
+		    pmap->pm_ucr3 != PMAP_NO_CR3) {
+			critical_enter();
+			pcid = pmap->pm_pcids[0].pm_pcid;
+			if (invpcid_works) {
+				d.pcid = pcid | PMAP_PCID_USER_PT;
+				d.pad = 0;
+				d.addr = va;
+				invpcid(&d, INVPCID_ADDR);
+			} else {
+				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
+				ucr3 = pmap->pm_ucr3 | pcid |
+				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
+			}
+			critical_exit();
+		}
+	} else if (pmap_pcid_enabled)
 		pmap->pm_pcids[0].pm_gen = 0;
 }
 
@@ -1766,7 +1878,9 @@
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
+	struct invpcid_descr d;
 	vm_offset_t addr;
+	uint64_t kcr3, ucr3;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
@@ -1778,6 +1892,25 @@
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
+		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
+		    pmap->pm_ucr3 != PMAP_NO_CR3) {
+			critical_enter();
+			if (invpcid_works) {
+				d.pcid = pmap->pm_pcids[0].pm_pcid |
+				    PMAP_PCID_USER_PT;
+				d.pad = 0;
+				d.addr = sva;
+				for (; d.addr < eva; d.addr += PAGE_SIZE)
+					invpcid(&d, INVPCID_ADDR);
+			} else {
+				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
+				    pm_pcid | CR3_PCID_SAVE;
+				ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
+				    pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
+				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
+			}
+			critical_exit();
+		}
 	} else if (pmap_pcid_enabled) {
 		pmap->pm_pcids[0].pm_gen = 0;
 	}
@@ -1787,6 +1920,7 @@
 pmap_invalidate_all(pmap_t pmap)
 {
 	struct invpcid_descr d;
+	uint64_t kcr3, ucr3;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
@@ -1804,15 +1938,26 @@
 		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		if (pmap_pcid_enabled) {
+			critical_enter();
 			if (invpcid_works) {
 				d.pcid = pmap->pm_pcids[0].pm_pcid;
 				d.pad = 0;
 				d.addr = 0;
 				invpcid(&d, INVPCID_CTX);
+				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+					d.pcid |= PMAP_PCID_USER_PT;
+					invpcid(&d, INVPCID_CTX);
+				}
 			} else {
-				load_cr3(pmap->pm_cr3 | pmap->pm_pcids[0].
-				    pm_pcid);
+				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
+				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+					ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
+					    0].pm_pcid | PMAP_PCID_USER_PT;
+					pmap_pti_pcid_invalidate(ucr3, kcr3);
+				} else
+					load_cr3(kcr3);
 			}
+			critical_exit();
 		} else {
 			invltlb();
 		}
@@ -2094,7 +2239,7 @@
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
-	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
+	pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
 }
 
 static __inline void
@@ -2105,7 +2250,7 @@
 
 	pte = vtopte(va);
 	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
-	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
+	pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
 }
 
 /*
@@ -2165,7 +2310,7 @@
 		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
 		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
 			oldpte |= *pte;
-			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
+			pte_store(pte, pa | pg_g | X86_PG_RW | X86_PG_V);
 		}
 		pte++;
 	}
@@ -2284,6 +2429,10 @@
 		pml4_entry_t *pml4;
 		pml4 = pmap_pml4e(pmap, va);
 		*pml4 = 0;
+		if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
+			pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
+			*pml4 = 0;
+		}
 	} else if (m->pindex >= NUPDE) {
 		/* PD page */
 		pdp_entry_t *pdp;
@@ -2349,7 +2498,10 @@
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
+	pmap->pm_pml4u = NULL;
 	pmap->pm_cr3 = KPML4phys;
+	/* hack to keep pmap_pti_pcid_invalidate() alive */
+	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
@@ -2358,6 +2510,8 @@
 	CPU_FOREACH(i) {
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
+		if (!pti)
+			__pcpu[i].pc_kcr3 = PMAP_NO_CR3;
 	}
 	PCPU_SET(curpmap, kernel_pmap);
 	pmap_activate(curthread);
@@ -2387,6 +2541,17 @@
 	    X86_PG_A | X86_PG_M;
 }
 
+static void
+pmap_pinit_pml4_pti(vm_page_t pml4pg)
+{
+	pml4_entry_t *pm_pml4;
+	int i;
+
+	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+	for (i = 0; i < NPML4EPG; i++)
+		pm_pml4[i] = pti_pml4[i];
+}
+
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
@@ -2394,7 +2559,7 @@
 int
 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 {
-	vm_page_t pml4pg;
+	vm_page_t pml4pg, pml4pgu;
 	vm_paddr_t pml4phys;
 	int i;
 
@@ -2411,8 +2576,11 @@
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
 	}
-	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
+	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
+	pmap->pm_ucr3 = PMAP_NO_CR3;
+	pmap->pm_pml4u = NULL;
 
+	pmap->pm_type = pm_type;
 	if ((pml4pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_pml4);
 
@@ -2420,10 +2588,21 @@
 	 * Do not install the host kernel mappings in the nested page
 	 * tables. These mappings are meaningless in the guest physical
 	 * address space.
+	 * Install minimal kernel mappings in PTI case.
 	 */
-	if ((pmap->pm_type = pm_type) == PT_X86) {
+	if (pm_type == PT_X86) {
 		pmap->pm_cr3 = pml4phys;
 		pmap_pinit_pml4(pml4pg);
+		if (pti) {
+			while ((pml4pgu = vm_page_alloc(NULL, 0,
+			    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED))
+			    == NULL)
+				VM_WAIT;
+			pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
+			    VM_PAGE_TO_PHYS(pml4pgu));
+			pmap_pinit_pml4_pti(pml4pgu);
+			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
+		}
 	}
 
 	pmap->pm_root.rt_root = 0;
@@ -2495,7 +2674,7 @@
 	 */
 
 	if (ptepindex >= (NUPDE + NUPDPE)) {
-		pml4_entry_t *pml4;
+		pml4_entry_t *pml4, *pml4u;
 		vm_pindex_t pml4index;
 
 		/* Wire up a new PDPE page */
@@ -2502,7 +2681,21 @@
 		pml4index = ptepindex - (NUPDE + NUPDPE);
 		pml4 = &pmap->pm_pml4[pml4index];
 		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+		if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
+			/*
+			 * PTI: Make all user-space mappings in the
+			 * kernel-mode page table no-execute so that
+			 * we detect any programming errors that leave
+			 * the kernel-mode page table active on return
+			 * to user space.
+			 */
+			*pml4 |= pg_nx;
 
+			pml4u = &pmap->pm_pml4u[pml4index];
+			*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
+			    PG_A | PG_M;
+		}
+
 	} else if (ptepindex >= NUPDE) {
 		vm_pindex_t pml4index;
 		vm_pindex_t pdpindex;
@@ -2702,6 +2895,13 @@
 	m->wire_count--;
 	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 	vm_page_free_zero(m);
+
+	if (pmap->pm_pml4u != NULL) {
+		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
+		m->wire_count--;
+		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+		vm_page_free(m);
+	}
 }
 
 static int
@@ -6867,13 +7067,15 @@
 
 	CRITICAL_ASSERT(curthread);
 	gen = PCPU_GET(pcid_gen);
-	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
-	    pmap->pm_pcids[cpuid].pm_gen == gen)
+	if (!pti && (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN ||
+	    pmap->pm_pcids[cpuid].pm_gen == gen))
 		return (CR3_PCID_SAVE);
 	pcid_next = PCPU_GET(pcid_next);
-	KASSERT(pcid_next <= PMAP_PCID_OVERMAX, ("cpu %d pcid_next %#x",
-	    cpuid, pcid_next));
-	if (pcid_next == PMAP_PCID_OVERMAX) {
+	KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
+	    (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
+	    ("cpu %d pcid_next %#x", cpuid, pcid_next));
+	if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
+	    (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
 		new_gen = gen + 1;
 		if (new_gen == 0)
 			new_gen = 1;
@@ -6892,7 +7094,8 @@
 pmap_activate_sw(struct thread *td)
 {
 	pmap_t oldpmap, pmap;
-	uint64_t cached, cr3;
+	struct invpcid_descr d;
+	uint64_t cached, cr3, kcr3, ucr3;
 	register_t rflags;
 	u_int cpuid;
 
@@ -6948,11 +7151,41 @@
 				PCPU_INC(pm_save_cnt);
 		}
 		PCPU_SET(curpmap, pmap);
+		if (pti) {
+			kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
+			ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
+			    PMAP_PCID_USER_PT;
+
+			/*
+			 * Manually invalidate translations cached
+			 * from the user page table, which are not
+			 * flushed by reload of cr3 with the kernel
+			 * page table pointer above.
+			 */
+			if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+				if (invpcid_works) {
+					d.pcid = PMAP_PCID_USER_PT |
+					    pmap->pm_pcids[cpuid].pm_pcid;
+					d.pad = 0;
+					d.addr = 0;
+					invpcid(&d, INVPCID_CTX);
+				} else {
+					pmap_pti_pcid_invalidate(ucr3, kcr3);
+				}
+			}
+
+			PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
+			PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
+		}
 		if (!invpcid_works)
 			intr_restore(rflags);
 	} else if (cr3 != pmap->pm_cr3) {
 		load_cr3(pmap->pm_cr3);
 		PCPU_SET(curpmap, pmap);
+		if (pti) {
+			PCPU_SET(kcr3, pmap->pm_cr3);
+			PCPU_SET(ucr3, pmap->pm_ucr3);
+		}
 	}
 #ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
@@ -7271,6 +7504,291 @@
 	mtx_unlock_spin(&qframe_mtx);
 }
 
+static vm_page_t
+pmap_pti_alloc_page(void)
+{
+	vm_page_t m;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
+	    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+	return (m);
+}
+
+static bool
+pmap_pti_free_page(vm_page_t m)
+{
+
+	KASSERT(m->wire_count > 0, ("page %p not wired", m));
+	m->wire_count--;
+	if (m->wire_count != 0)
+		return (false);
+	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+	vm_page_free_zero(m);
+	return (true);
+}
+
+static void
+pmap_pti_init(void)
+{
+	vm_page_t pml4_pg;
+	pdp_entry_t *pdpe;
+	vm_offset_t va;
+	int i;
+
+	if (!pti)
+		return;
+	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
+	VM_OBJECT_WLOCK(pti_obj);
+	pml4_pg = pmap_pti_alloc_page();
+	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
+	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
+	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
+		pdpe = pmap_pti_pdpe(va);
+		pmap_pti_wire_pte(pdpe);
+	}
+	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
+	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
+	pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
+	    sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
+	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
+	    sizeof(struct gate_descriptor) * NIDT, false);
+	pmap_pti_add_kva_locked((vm_offset_t)common_tss,
+	    (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
+	CPU_FOREACH(i) {
+		/* Doublefault stack IST 1 */
+		va = common_tss[i].tss_ist1;
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+		/* NMI stack IST 2 */
+		va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+		/* MC# stack IST 3 */
+		va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+	}
+	pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
+	    (vm_offset_t)etext, true);
+	pti_finalized = true;
+	VM_OBJECT_WUNLOCK(pti_obj);
+}
+SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
+
+static pdp_entry_t *
+pmap_pti_pdpe(vm_offset_t va)
+{
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	vm_page_t m;
+	vm_pindex_t pml4_idx;
+	vm_paddr_t mphys;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	pml4_idx = pmap_pml4e_index(va);
+	pml4e = &pti_pml4[pml4_idx];
+	m = NULL;
+	if (*pml4e == 0) {
+		if (pti_finalized)
+			panic("pml4 alloc after finalization\n");
+		m = pmap_pti_alloc_page();
+		if (*pml4e != 0) {
+			pmap_pti_free_page(m);
+			mphys = *pml4e & ~PAGE_MASK;
+		} else {
+			mphys = VM_PAGE_TO_PHYS(m);
+			*pml4e = mphys | X86_PG_RW | X86_PG_V;
+		}
+	} else {
+		mphys = *pml4e & ~PAGE_MASK;
+	}
+	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
+	return (pdpe);
+}
+
+static void
+pmap_pti_wire_pte(void *pte)
+{
+	vm_page_t m;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
+	m->wire_count++;
+}
+
+static void
+pmap_pti_unwire_pde(void *pde, bool only_ref)
+{
+	vm_page_t m;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
+	MPASS(m->wire_count > 0);
+	MPASS(only_ref || m->wire_count > 1);
+	pmap_pti_free_page(m);
+}
+
+static void
+pmap_pti_unwire_pte(void *pte, vm_offset_t va)
+{
+	vm_page_t m;
+	pd_entry_t *pde;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
+	MPASS(m->wire_count > 0);
+	if (pmap_pti_free_page(m)) {
+		pde = pmap_pti_pde(va);
+		MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
+		*pde = 0;
+		pmap_pti_unwire_pde(pde, false);
+	}
+}
+
+static pd_entry_t *
+pmap_pti_pde(vm_offset_t va)
+{
+	pdp_entry_t *pdpe;
+	pd_entry_t *pde;
+	vm_page_t m;
+	vm_pindex_t pd_idx;
+	vm_paddr_t mphys;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	pdpe = pmap_pti_pdpe(va);
+	if (*pdpe == 0) {
+		m = pmap_pti_alloc_page();
+		if (*pdpe != 0) {
+			pmap_pti_free_page(m);
+			MPASS((*pdpe & X86_PG_PS) == 0);
+			mphys = *pdpe & ~PAGE_MASK;
+		} else {
+			mphys = VM_PAGE_TO_PHYS(m);
+			*pdpe = mphys | X86_PG_RW | X86_PG_V;
+		}
+	} else {
+		MPASS((*pdpe & X86_PG_PS) == 0);
+		mphys = *pdpe & ~PAGE_MASK;
+	}
+
+	pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
+	pd_idx = pmap_pde_index(va);
+	pde += pd_idx;
+	return (pde);
+}
+
+static pt_entry_t *
+pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
+{
+	pd_entry_t *pde;
+	pt_entry_t *pte;
+	vm_page_t m;
+	vm_paddr_t mphys;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	pde = pmap_pti_pde(va);
+	if (unwire_pde != NULL) {
+		*unwire_pde = true;
+		pmap_pti_wire_pte(pde);
+	}
+	if (*pde == 0) {
+		m = pmap_pti_alloc_page();
+		if (*pde != 0) {
+			pmap_pti_free_page(m);
+			MPASS((*pde & X86_PG_PS) == 0);
+			mphys = *pde & ~(PAGE_MASK | pg_nx);
+		} else {
+			mphys = VM_PAGE_TO_PHYS(m);
+			*pde = mphys | X86_PG_RW | X86_PG_V;
+			if (unwire_pde != NULL)
+				*unwire_pde = false;
+		}
+	} else {
+		MPASS((*pde & X86_PG_PS) == 0);
+		mphys = *pde & ~(PAGE_MASK | pg_nx);
+	}
+
+	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
+	pte += pmap_pte_index(va);
+
+	return (pte);
+}
+
+static void
+pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
+{
+	vm_paddr_t pa;
+	pd_entry_t *pde;
+	pt_entry_t *pte, ptev;
+	bool unwire_pde;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	sva = trunc_page(sva);
+	MPASS(sva > VM_MAXUSER_ADDRESS);
+	eva = round_page(eva);
+	MPASS(sva < eva);
+	for (; sva < eva; sva += PAGE_SIZE) {
+		pte = pmap_pti_pte(sva, &unwire_pde);
+		pa = pmap_kextract(sva);
+		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A |
+		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
+		    VM_MEMATTR_DEFAULT, FALSE);
+		if (*pte == 0) {
+			pte_store(pte, ptev);
+			pmap_pti_wire_pte(pte);
+		} else {
+			KASSERT(!pti_finalized,
+			    ("pti overlap after fin %#lx %#lx %#lx",
+			    sva, *pte, ptev));
+			KASSERT(*pte == ptev,
+			    ("pti non-identical pte after fin %#lx %#lx %#lx",
+			    sva, *pte, ptev));
+		}
+		if (unwire_pde) {
+			pde = pmap_pti_pde(sva);
+			pmap_pti_unwire_pde(pde, true);
+		}
+	}
+}
+
+void
+pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
+{
+
+	if (!pti)
+		return;
+	VM_OBJECT_WLOCK(pti_obj);
+	pmap_pti_add_kva_locked(sva, eva, exec);
+	VM_OBJECT_WUNLOCK(pti_obj);
+}
+
+void
+pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
+{
+	pt_entry_t *pte;
+	vm_offset_t va;
+
+	if (!pti)
+		return;
+	sva = rounddown2(sva, PAGE_SIZE);
+	MPASS(sva > VM_MAXUSER_ADDRESS);
+	eva = roundup2(eva, PAGE_SIZE);
+	MPASS(sva < eva);
+	VM_OBJECT_WLOCK(pti_obj);
+	for (va = sva; va < eva; va += PAGE_SIZE) {
+		pte = pmap_pti_pte(va, NULL);
+		KASSERT((*pte & X86_PG_V) != 0,
+		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
+		    (u_long)pte, *pte));
+		pte_clear(pte);
+		pmap_pti_unwire_pte(pte, va);
+	}
+	pmap_invalidate_range(kernel_pmap, sva, eva);
+	VM_OBJECT_WUNLOCK(pti_obj);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
--- sys/amd64/amd64/support.S.orig
+++ sys/amd64/amd64/support.S
@@ -33,6 +33,7 @@
 #include "opt_ddb.h"
 
 #include <machine/asmacros.h>
+#include <machine/specialreg.h>
 #include <machine/pmap.h>
 
 #include "assym.s"
@@ -787,3 +788,115 @@
 	movl	$EFAULT,%eax
 	POP_FRAME_POINTER
 	ret
+
+/*
+ * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
+ * Invalidates address space addressed by ucr3, then returns to kcr3.
+ * Done in assembler to ensure no other memory accesses happen while
+ * on ucr3.
+ */
+	ALIGN_TEXT
+ENTRY(pmap_pti_pcid_invalidate)
+	pushfq
+	cli
+	movq	%rdi,%cr3	/* to user page table */
+	movq	%rsi,%cr3	/* back to kernel */
+	popfq
+	retq
+
+/*
+ * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
+ * Invalidates virtual address va in address space ucr3, then returns to kcr3.
+ */
+	ALIGN_TEXT
+ENTRY(pmap_pti_pcid_invlpg)
+	pushfq
+	cli
+	movq	%rdi,%cr3	/* to user page table */
+	invlpg	(%rdx)
+	movq	%rsi,%cr3	/* back to kernel */
+	popfq
+	retq
+
+/*
+ * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
+ *     vm_offset_t eva);
+ * Invalidates virtual addresses between sva and eva in address space ucr3,
+ * then returns to kcr3.
+ */
+	ALIGN_TEXT
+ENTRY(pmap_pti_pcid_invlrng)
+	pushfq
+	cli
+	movq	%rdi,%cr3	/* to user page table */
+1:	invlpg	(%rdx)
+	addq	$PAGE_SIZE,%rdx
+	cmpq	%rdx,%rcx
+	ja	1b
+	movq	%rsi,%cr3	/* back to kernel */
+	popfq
+	retq
+
+	.altmacro
+	.macro	ibrs_seq_label l
+handle_ibrs_\l:
+	.endm
+	.macro	ibrs_call_label l
+	call	handle_ibrs_\l
+	.endm
+	.macro	ibrs_seq count
+	ll=1
+	.rept	\count
+	ibrs_call_label	%(ll)
+	nop
+	ibrs_seq_label %(ll)
+	addq	$8,%rsp
+	ll=ll+1
+	.endr
+	.endm
+
+/* all callers already saved %rax, %rdx, and %rcx */
+ENTRY(handle_ibrs_entry)
+	cmpb	$0,hw_ibrs_active(%rip)
+	je	1f
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	movl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
+	movl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx
+	wrmsr
+	movb	$1,PCPU(IBPB_SET)
+	testl	$CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip)
+	jne	1f
+	ibrs_seq 32
+1:	ret
+END(handle_ibrs_entry)
+
+ENTRY(handle_ibrs_exit)
+	cmpb	$0,PCPU(IBPB_SET)
+	je	1f
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+	wrmsr
+	movb	$0,PCPU(IBPB_SET)
+1:	ret
+END(handle_ibrs_exit)
+
+/* registers-neutral version, but needs stack */
+ENTRY(handle_ibrs_exit_rs)
+	cmpb	$0,PCPU(IBPB_SET)
+	je	1f
+	pushq	%rax
+	pushq	%rdx
+	pushq	%rcx
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	xorl	%eax,%eax
+	xorl	%edx,%edx
+	wrmsr
+	popq	%rcx
+	popq	%rdx
+	popq	%rax
+	movb	$0,PCPU(IBPB_SET)
+1:	ret
+END(handle_ibrs_exit_rs)
+
+	.noaltmacro
--- sys/amd64/amd64/sys_machdep.c.orig
+++ sys/amd64/amd64/sys_machdep.c
@@ -357,7 +357,9 @@
 	pcb = td->td_pcb;
 	if (pcb->pcb_tssp == NULL) {
 		tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
-		    ctob(IOPAGES+1), M_WAITOK);
+		    ctob(IOPAGES + 1), M_WAITOK);
+		pmap_pti_add_kva((vm_offset_t)tssp, (vm_offset_t)tssp +
+		    ctob(IOPAGES + 1), false);
 		iomap = (char *)&tssp[1];
 		memset(iomap, 0xff, IOPERM_BITMAP_SIZE);
 		critical_enter();
@@ -452,6 +454,8 @@
 	struct proc_ldt *pldt, *new_ldt;
 	struct mdproc *mdp;
 	struct soft_segment_descriptor sldt;
+	vm_offset_t sva;
+	vm_size_t sz;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 	mdp = &p->p_md;
@@ -459,13 +463,13 @@
 		return (mdp->md_ldt);
 	mtx_unlock(&dt_lock);
 	new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
-	new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
-	     max_ldt_segment * sizeof(struct user_segment_descriptor),
-	     M_WAITOK | M_ZERO);
+	sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
+	sva = kmem_malloc(kernel_arena, sz, M_WAITOK | M_ZERO);
+	new_ldt->ldt_base = (caddr_t)sva;
+	pmap_pti_add_kva(sva, sva + sz, false);
 	new_ldt->ldt_refcnt = 1;
-	sldt.ssd_base = (uint64_t)new_ldt->ldt_base;
-	sldt.ssd_limit = max_ldt_segment *
-	    sizeof(struct user_segment_descriptor) - 1;
+	sldt.ssd_base = sva;
+	sldt.ssd_limit = sz - 1;
 	sldt.ssd_type = SDT_SYSLDT;
 	sldt.ssd_dpl = SEL_KPL;
 	sldt.ssd_p = 1;
@@ -475,8 +479,8 @@
 	mtx_lock(&dt_lock);
 	pldt = mdp->md_ldt;
 	if (pldt != NULL && !force) {
-		kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base,
-		    max_ldt_segment * sizeof(struct user_segment_descriptor));
+		pmap_pti_remove_kva(sva, sva + sz);
+		kmem_free(kernel_arena, sva, sz);
 		free(new_ldt, M_SUBPROC);
 		return (pldt);
 	}
@@ -518,10 +522,14 @@
 static void
 user_ldt_derefl(struct proc_ldt *pldt)
 {
+	vm_offset_t sva;
+	vm_size_t sz;
 
 	if (--pldt->ldt_refcnt == 0) {
-		kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
-		    max_ldt_segment * sizeof(struct user_segment_descriptor));
+		sva = (vm_offset_t)pldt->ldt_base;
+		sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
+		pmap_pti_remove_kva(sva, sva + sz);
+		kmem_free(kernel_arena, sva, sz);
 		free(pldt, M_SUBPROC);
 	}
 }
--- sys/amd64/amd64/trap.c.orig
+++ sys/amd64/amd64/trap.c
@@ -218,11 +218,6 @@
 #endif
 	}
 
-	if (type == T_MCHK) {
-		mca_intr();
-		goto out;
-	}
-
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
@@ -452,9 +447,28 @@
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
+			 *
+			 * In case of PTI, the IRETQ faulted while the
+			 * kernel used the pti stack, and exception
+			 * frame records %rsp value pointing to that
+			 * stack.  If we return normally to
+			 * doreti_iret_fault, the trapframe is
+			 * reconstructed on pti stack, and calltrap()
+			 * called on it as well.  Due to the very
+			 * limited pti stack size, kernel does not
+			 * survive for too long.  Switch to the normal
+			 * thread stack for the trap handling.
+			 *
+			 * Magic '5' is the number of qwords occupied by
+			 * the hardware trap frame.
 			 */
 			if (frame->tf_rip == (long)doreti_iret) {
 				frame->tf_rip = (long)doreti_iret_fault;
+				if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR(
+				    pti_stack) + (PC_PTI_STACK_SZ - 5) *
+				    sizeof(register_t))
+					frame->tf_rsp = PCPU_GET(rsp0) - 5 *
+					    sizeof(register_t);
 				goto out;
 			}
 			if (frame->tf_rip == (long)ld_ds) {
@@ -694,6 +708,17 @@
 	}
 
 	/*
+	 * If nx protection of the usermode portion of kernel page
+	 * tables caused trap, panic.
+	 */
+	if (pti && usermode && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W |
+	    PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) &&
+	    (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) ==
+	    (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK))
+		panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid,
+		    p->p_comm, frame->tf_err);
+
+	/*
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
--- sys/amd64/amd64/vm_machdep.c.orig
+++ sys/amd64/amd64/vm_machdep.c
@@ -339,6 +339,8 @@
 	 * Clean TSS/iomap
 	 */
 	if (pcb->pcb_tssp != NULL) {
+		pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
+		    (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
 		kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp,
 		    ctob(IOPAGES + 1));
 		pcb->pcb_tssp = NULL;
--- sys/amd64/ia32/ia32_exception.S.orig
+++ sys/amd64/ia32/ia32_exception.S
@@ -40,24 +40,27 @@
  * that it originated in supervisor mode and skip the swapgs.
  */
 	SUPERALIGN_TEXT
+IDTVEC(int0x80_syscall_pti)
+	PTI_UENTRY has_err=0
+	jmp	int0x80_syscall_common
+	SUPERALIGN_TEXT
 IDTVEC(int0x80_syscall)
 	swapgs
+int0x80_syscall_common:
 	pushq	$2			/* sizeof "int 0x80" */
 	subq	$TF_ERR,%rsp		/* skip over tf_trapno */
 	movq	%rdi,TF_RDI(%rsp)
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	call	handle_ibrs_entry
 	sti
 	movq	%rsi,TF_RSI(%rsp)
-	movq	%rdx,TF_RDX(%rsp)
-	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
-	movq	%rax,TF_RAX(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
 	movq	%rbp,TF_RBP(%rsp)
 	movq	%r10,TF_R10(%rsp)
--- sys/amd64/ia32/ia32_syscall.c.orig
+++ sys/amd64/ia32/ia32_syscall.c
@@ -93,7 +93,8 @@
 
 #define	IDTVEC(name)	__CONCAT(X,name)
 
-extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd);
+extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti),
+    IDTVEC(rsvd), IDTVEC(rsvd_pti);
 
 void ia32_syscall(struct trapframe *frame);	/* Called from asm code */
 
@@ -205,7 +206,8 @@
 ia32_syscall_enable(void *dummy)
 {
 
- 	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
+ 	setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) :
+	    &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
 }
 
 static void
@@ -212,7 +214,8 @@
 ia32_syscall_disable(void *dummy)
 {
 
- 	setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+ 	setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd),
+	    SDT_SYSIGT, SEL_KPL, 0);
 }
 
 SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL);
--- sys/amd64/include/asmacros.h.orig
+++ sys/amd64/include/asmacros.h
@@ -1,7 +1,15 @@
+/* -*- mode: asm -*- */
 /*-
  * Copyright (c) 1993 The Regents of the University of California.
  * All rights reserved.
  *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -144,70 +152,135 @@
 
 #ifdef LOCORE
 /*
+ * Access per-CPU data.
+ */
+#define	PCPU(member)	%gs:PC_ ## member
+#define	PCPU_ADDR(member, reg)					\
+	movq %gs:PC_PRVSPACE, reg ;				\
+	addq $PC_ ## member, reg
+
+/*
  * Convenience macro for declaring interrupt entry points.
  */
 #define	IDTVEC(name)	ALIGN_TEXT; .globl __CONCAT(X,name); \
 			.type __CONCAT(X,name),@function; __CONCAT(X,name):
 
-/*
- * Macros to create and destroy a trap frame.
- */
-#define PUSH_FRAME							\
-	subq	$TF_RIP,%rsp ;	/* skip dummy tf_err and tf_trapno */	\
-	testb	$SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */	\
-	jz	1f ;		/* Yes, dont swapgs again */		\
-	swapgs ;							\
-1:	movq	%rdi,TF_RDI(%rsp) ;					\
-	movq	%rsi,TF_RSI(%rsp) ;					\
-	movq	%rdx,TF_RDX(%rsp) ;					\
-	movq	%rcx,TF_RCX(%rsp) ;					\
-	movq	%r8,TF_R8(%rsp) ;					\
-	movq	%r9,TF_R9(%rsp) ;					\
-	movq	%rax,TF_RAX(%rsp) ;					\
-	movq	%rbx,TF_RBX(%rsp) ;					\
-	movq	%rbp,TF_RBP(%rsp) ;					\
-	movq	%r10,TF_R10(%rsp) ;					\
-	movq	%r11,TF_R11(%rsp) ;					\
-	movq	%r12,TF_R12(%rsp) ;					\
-	movq	%r13,TF_R13(%rsp) ;					\
-	movq	%r14,TF_R14(%rsp) ;					\
-	movq	%r15,TF_R15(%rsp) ;					\
-	movw	%fs,TF_FS(%rsp) ;					\
-	movw	%gs,TF_GS(%rsp) ;					\
-	movw	%es,TF_ES(%rsp) ;					\
-	movw	%ds,TF_DS(%rsp) ;					\
-	movl	$TF_HASSEGS,TF_FLAGS(%rsp) ;				\
+	.macro	SAVE_SEGS
+	movw	%fs,TF_FS(%rsp)
+	movw	%gs,TF_GS(%rsp)
+	movw	%es,TF_ES(%rsp)
+	movw	%ds,TF_DS(%rsp)
+	.endm
+
+	.macro	MOVE_STACKS qw
+	.L.offset=0
+	.rept	\qw
+	movq	.L.offset(%rsp),%rdx
+	movq	%rdx,.L.offset(%rax)
+	.L.offset=.L.offset+8
+	.endr
+	.endm
+
+	.macro	PTI_UUENTRY has_err
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	movq	PCPU(RSP0),%rax
+	subq	$PTI_SIZE,%rax
+	MOVE_STACKS	((PTI_SIZE / 8) - 1 + \has_err)
+	movq	%rax,%rsp
+	popq	%rdx
+	popq	%rax
+	.endm
+
+	.macro	PTI_UENTRY has_err
+	swapgs
+	pushq	%rax
+	pushq	%rdx
+	PTI_UUENTRY \has_err
+	.endm
+
+	.macro	PTI_ENTRY name, cont, has_err=0
+	ALIGN_TEXT
+	.globl	X\name\()_pti
+	.type	X\name\()_pti,@function
+X\name\()_pti:
+	/* %rax, %rdx and possibly err not yet pushed */
+	testb	$SEL_RPL_MASK,PTI_CS-(2+1-\has_err)*8(%rsp)
+	jz	\cont
+	PTI_UENTRY \has_err
+	swapgs
+	jmp	\cont
+	.endm
+
+	.macro	PTI_INTRENTRY vec_name
+	SUPERALIGN_TEXT
+	.globl	X\vec_name\()_pti
+	.type	X\vec_name\()_pti,@function
+X\vec_name\()_pti:
+	testb	$SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* err, %rax, %rdx not pushed */
+	jz	\vec_name\()_u
+	PTI_UENTRY has_err=0
+	jmp	\vec_name\()_u
+	.endm
+
+	.macro	INTR_PUSH_FRAME vec_name
+	SUPERALIGN_TEXT
+	.globl	X\vec_name
+	.type	X\vec_name,@function
+X\vec_name:
+	testb	$SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* come from kernel? */
+	jz	\vec_name\()_u		/* Yes, dont swapgs again */
+	swapgs
+\vec_name\()_u:
+	subq	$TF_RIP,%rsp	/* skip dummy tf_err and tf_trapno */
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rsi,TF_RSI(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movq	%r8,TF_R8(%rsp)
+	movq	%r9,TF_R9(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rbx,TF_RBX(%rsp)
+	movq	%rbp,TF_RBP(%rsp)
+	movq	%r10,TF_R10(%rsp)
+	movq	%r11,TF_R11(%rsp)
+	movq	%r12,TF_R12(%rsp)
+	movq	%r13,TF_R13(%rsp)
+	movq	%r14,TF_R14(%rsp)
+	movq	%r15,TF_R15(%rsp)
+	SAVE_SEGS
+	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)  /* come from kernel ? */
+	jz	1f		/* yes, leave PCB_FULL_IRET alone */
+	movq	PCPU(CURPCB),%r8
+	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8)
+1:
+	.endm
 
-#define POP_FRAME							\
-	movq	TF_RDI(%rsp),%rdi ;					\
-	movq	TF_RSI(%rsp),%rsi ;					\
-	movq	TF_RDX(%rsp),%rdx ;					\
-	movq	TF_RCX(%rsp),%rcx ;					\
-	movq	TF_R8(%rsp),%r8 ;					\
-	movq	TF_R9(%rsp),%r9 ;					\
-	movq	TF_RAX(%rsp),%rax ;					\
-	movq	TF_RBX(%rsp),%rbx ;					\
-	movq	TF_RBP(%rsp),%rbp ;					\
-	movq	TF_R10(%rsp),%r10 ;					\
-	movq	TF_R11(%rsp),%r11 ;					\
-	movq	TF_R12(%rsp),%r12 ;					\
-	movq	TF_R13(%rsp),%r13 ;					\
-	movq	TF_R14(%rsp),%r14 ;					\
-	movq	TF_R15(%rsp),%r15 ;					\
-	testb	$SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */	\
-	jz	1f ;		/* keep kernel GS.base */		\
-	cli ;								\
-	swapgs ;							\
-1:	addq	$TF_RIP,%rsp	/* skip over tf_err, tf_trapno */
+	.macro	INTR_HANDLER vec_name
+	.text
+	PTI_INTRENTRY	\vec_name
+	INTR_PUSH_FRAME	\vec_name
+	.endm
 
-/*
- * Access per-CPU data.
- */
-#define	PCPU(member)	%gs:PC_ ## member
-#define	PCPU_ADDR(member, reg)					\
-	movq %gs:PC_PRVSPACE, reg ;				\
-	addq $PC_ ## member, reg
+	.macro	RESTORE_REGS
+	movq	TF_RDI(%rsp),%rdi
+	movq	TF_RSI(%rsp),%rsi
+	movq	TF_RDX(%rsp),%rdx
+	movq	TF_RCX(%rsp),%rcx
+	movq	TF_R8(%rsp),%r8
+	movq	TF_R9(%rsp),%r9
+	movq	TF_RAX(%rsp),%rax
+	movq	TF_RBX(%rsp),%rbx
+	movq	TF_RBP(%rsp),%rbp
+	movq	TF_R10(%rsp),%r10
+	movq	TF_R11(%rsp),%r11
+	movq	TF_R12(%rsp),%r12
+	movq	TF_R13(%rsp),%r13
+	movq	TF_R14(%rsp),%r14
+	movq	TF_R15(%rsp),%r15
+	.endm
 
 #endif /* LOCORE */
 
--- sys/amd64/include/frame.h.orig
+++ sys/amd64/include/frame.h
@@ -1,6 +1,50 @@
 /*-
- * This file is in the public domain.
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
  */
-/* $FreeBSD: releng/11.1/sys/amd64/include/frame.h 247047 2013-02-20 17:39:52Z kib $ */
 
+#ifndef	_AMD64_FRAME_H
+#define	_AMD64_FRAME_H
+
 #include <x86/frame.h>
+
+struct pti_frame {
+	register_t	pti_rdx;
+	register_t	pti_rax;
+	register_t	pti_err;
+	register_t	pti_rip;
+	register_t	pti_cs;
+	register_t	pti_rflags;
+	register_t	pti_rsp;
+	register_t	pti_ss;
+};
+
+#endif
--- sys/amd64/include/intr_machdep.h.orig
+++ sys/amd64/include/intr_machdep.h
@@ -136,7 +136,7 @@
 
 /*
  * The following data structure holds per-cpu data, and is placed just
- * above the top of the space used for the NMI stack.
+ * above the top of the space used for the NMI and MC# stacks.
  */
 struct nmi_pcpu {
 	register_t	np_pcpu;
--- sys/amd64/include/md_var.h.orig
+++ sys/amd64/include/md_var.h
@@ -35,9 +35,17 @@
 #include <x86/x86_var.h>
 
 extern  uint64_t *vm_page_dump;
+extern int	hw_ibrs_disable;
 
+/*
+ * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its
+ * value is the physical address at which the kernel is loaded.
+ */
+extern char kernphys[];
+
 struct	savefpu;
 
+void	amd64_conf_fast_syscall(void);
 void	amd64_db_resume_dbreg(void);
 void	amd64_syscall(struct thread *td, int traced);
 void	doreti_iret(void) __asm(__STRING(doreti_iret));
--- sys/amd64/include/pcb.h.orig
+++ sys/amd64/include/pcb.h
@@ -90,7 +90,7 @@
 	/* copyin/out fault recovery */
 	caddr_t		pcb_onfault;
 
-	uint64_t	pcb_pad0;
+	uint64_t	pcb_saved_ucr3;
 
 	/* local tss, with i/o bitmap; NULL for common */
 	struct amd64tss *pcb_tssp;
--- sys/amd64/include/pcpu.h.orig
+++ sys/amd64/include/pcpu.h
@@ -33,6 +33,7 @@
 #error "sys/cdefs.h is a prerequisite for this file"
 #endif
 
+#define	PC_PTI_STACK_SZ	16
 /*
  * The SMP parts are setup in pmap.c and locore.s for the BSP, and
  * mp_machdep.c sets up the data for the AP's to "see" when they awake.
@@ -46,8 +47,12 @@
 	struct	pmap *pc_curpmap;					\
 	struct	amd64tss *pc_tssp;	/* TSS segment active on CPU */	\
 	struct	amd64tss *pc_commontssp;/* Common TSS for the CPU */	\
+	uint64_t pc_kcr3;						\
+	uint64_t pc_ucr3;						\
+	uint64_t pc_saved_ucr3;						\
 	register_t pc_rsp0;						\
 	register_t pc_scratch_rsp;	/* User %rsp in syscall */	\
+	register_t pc_scratch_rax;					\
 	u_int	pc_apic_id;						\
 	u_int   pc_acpi_id;		/* ACPI CPU id */		\
 	/* Pointer to the CPU %fs descriptor */				\
@@ -61,12 +66,14 @@
 	uint64_t	pc_pm_save_cnt;					\
 	u_int	pc_cmci_mask;		/* MCx banks for CMCI */	\
 	uint64_t pc_dbreg[16];		/* ddb debugging regs */	\
+	uint64_t pc_pti_stack[PC_PTI_STACK_SZ];				\
 	int pc_dbreg_cmd;		/* ddb debugging reg cmd */	\
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	uint32_t pc_pcid_next;						\
 	uint32_t pc_pcid_gen;						\
 	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
-	char	__pad[145]		/* be divisor of PAGE_SIZE	\
+	uint32_t pc_ibpb_set;						\
+	char	__pad[96]		/* be divisor of PAGE_SIZE	\
 					   after cache alignment */
 
 #define	PC_DBREG_CMD_NONE	0
--- sys/amd64/include/pmap.h.orig
+++ sys/amd64/include/pmap.h
@@ -223,7 +223,11 @@
 #define	PMAP_PCID_NONE		0xffffffff
 #define	PMAP_PCID_KERN		0
 #define	PMAP_PCID_OVERMAX	0x1000
+#define	PMAP_PCID_OVERMAX_KERN	0x800
+#define	PMAP_PCID_USER_PT	0x800
 
+#define	PMAP_NO_CR3		(~0UL)
+
 #ifndef LOCORE
 
 #include <sys/queue.h>
@@ -313,7 +317,9 @@
 struct pmap {
 	struct mtx		pm_mtx;
 	pml4_entry_t		*pm_pml4;	/* KVA of level 4 page table */
+	pml4_entry_t		*pm_pml4u;	/* KVA of user l4 page table */
 	uint64_t		pm_cr3;
+	uint64_t		pm_ucr3;
 	TAILQ_HEAD(,pv_chunk)	pm_pvchunk;	/* list of mappings in pmap */
 	cpuset_t		pm_active;	/* active on cpus */
 	enum pmap_type		pm_type;	/* regular or nested tables */
@@ -419,6 +425,12 @@
 void	pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
 boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
 void	pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
+void	pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec);
+void	pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva);
+void	pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
+void	pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
+void	pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
+	    vm_offset_t eva);
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */
--- sys/amd64/include/smp.h.orig
+++ sys/amd64/include/smp.h
@@ -28,12 +28,36 @@
 
 /* IPI handlers */
 inthand_t
+	IDTVEC(justreturn),	/* interrupt CPU with minimum overhead */
+	IDTVEC(justreturn1_pti),
+	IDTVEC(invltlb_pti),
+	IDTVEC(invltlb_pcid_pti),
 	IDTVEC(invltlb_pcid),	/* TLB shootdowns - global, pcid */
-	IDTVEC(invltlb_invpcid),/* TLB shootdowns - global, invpcid */
-	IDTVEC(justreturn);	/* interrupt CPU with minimum overhead */
+	IDTVEC(invltlb_invpcid_pti_pti),
+	IDTVEC(invltlb_invpcid_nopti),
+	IDTVEC(invlpg_pti),
+	IDTVEC(invlpg_invpcid_pti),
+	IDTVEC(invlpg_invpcid),
+	IDTVEC(invlpg_pcid_pti),
+	IDTVEC(invlpg_pcid),
+	IDTVEC(invlrng_pti),
+	IDTVEC(invlrng_invpcid_pti),
+	IDTVEC(invlrng_invpcid),
+	IDTVEC(invlrng_pcid_pti),
+	IDTVEC(invlrng_pcid),
+	IDTVEC(invlcache_pti),
+	IDTVEC(ipi_intr_bitmap_handler_pti),
+	IDTVEC(cpustop_pti),
+	IDTVEC(cpususpend_pti),
+	IDTVEC(rendezvous_pti);
 
 void	invltlb_pcid_handler(void);
 void	invltlb_invpcid_handler(void);
+void	invltlb_invpcid_pti_handler(void);
+void	invlpg_invpcid_handler(void);
+void	invlpg_pcid_handler(void);
+void	invlrng_invpcid_handler(void);
+void	invlrng_pcid_handler(void);
 int	native_start_all_aps(void);
 
 #endif /* !LOCORE */
--- sys/amd64/vmm/intel/vmx.c.orig
+++ sys/amd64/vmm/intel/vmx.c
@@ -693,7 +693,8 @@
 		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
 		    &tmp);
 		if (error == 0) {
-			pirvec = lapic_ipi_alloc(&IDTVEC(justreturn));
+			pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
+			    &IDTVEC(justreturn));
 			if (pirvec < 0) {
 				if (bootverbose) {
 					printf("vmx_init: unable to allocate "
--- sys/amd64/vmm/vmm.c.orig
+++ sys/amd64/vmm/vmm.c
@@ -55,6 +55,7 @@
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
+#include <machine/md_var.h>
 #include <x86/psl.h>
 #include <x86/apicreg.h>
 
@@ -325,7 +326,8 @@
 
 	vmm_host_state_init();
 
-	vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn));
+	vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
+	    &IDTVEC(justreturn));
 	if (vmm_ipinum < 0)
 		vmm_ipinum = IPI_AST;
 
--- sys/conf/Makefile.amd64.orig
+++ sys/conf/Makefile.amd64
@@ -39,6 +39,7 @@
 
 ASM_CFLAGS.acpi_wakecode.S=	${CLANG_NO_IAS34}
 ASM_CFLAGS.mpboot.S=		${CLANG_NO_IAS34}
+ASM_CFLAGS.support.S=		${CLANG_NO_IAS}
 
 %BEFORE_DEPEND
 
--- sys/dev/cpuctl/cpuctl.c.orig
+++ sys/dev/cpuctl/cpuctl.c
@@ -71,6 +71,7 @@
     struct thread *td);
 static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data,
     struct thread *td);
+static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td);
 static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data,
     struct thread *td);
 static int update_intel(int cpu, cpuctl_update_args_t *args,
@@ -157,7 +158,8 @@
 	}
 	/* Require write flag for "write" requests. */
 	if ((cmd == CPUCTL_MSRCBIT || cmd == CPUCTL_MSRSBIT ||
-	    cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR) &&
+	    cmd == CPUCTL_UPDATE || cmd == CPUCTL_WRMSR ||
+	    cmd == CPUCTL_EVAL_CPU_FEATURES) &&
 	    (flags & FWRITE) == 0)
 		return (EPERM);
 	switch (cmd) {
@@ -185,6 +187,9 @@
 		ret = cpuctl_do_cpuid_count(cpu,
 		    (cpuctl_cpuid_count_args_t *)data, td);
 		break;
+	case CPUCTL_EVAL_CPU_FEATURES:
+		ret = cpuctl_do_eval_cpu_features(cpu, td);
+		break;
 	default:
 		ret = EINVAL;
 		break;
@@ -502,6 +507,30 @@
 	return (ret);
 }
 
+static int
+cpuctl_do_eval_cpu_features(int cpu, struct thread *td)
+{
+	int is_bound = 0;
+	int oldcpu;
+
+	KASSERT(cpu >= 0 && cpu <= mp_maxid,
+	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
+
+#ifdef __i386__
+	if (cpu_id == 0)
+		return (ENODEV);
+#endif
+	oldcpu = td->td_oncpu;
+	is_bound = cpu_sched_is_bound(td);
+	set_cpu(cpu, td);
+	identify_cpu1();
+	identify_cpu2();
+	hw_ibrs_recalculate();
+	restore_cpu(oldcpu, is_bound, td);
+	printcpuinfo();
+	return (0);
+}
+
 int
 cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
 {
--- sys/dev/hyperv/vmbus/amd64/vmbus_vector.S.orig
+++ sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
@@ -26,11 +26,11 @@
  * $FreeBSD$
  */
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 
-#include "assym.s"
-
 /*
  * This is the Hyper-V vmbus channel direct callback interrupt.
  * Only used when it is running on Hyper-V.
@@ -37,8 +37,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
-IDTVEC(vmbus_isr)
-	PUSH_FRAME
+	INTR_HANDLER	vmbus_isr
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	vmbus_handle_intr
--- sys/dev/hyperv/vmbus/i386/vmbus_vector.S.orig
+++ sys/dev/hyperv/vmbus/i386/vmbus_vector.S
@@ -37,6 +37,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
+IDTVEC(vmbus_isr_pti)
 IDTVEC(vmbus_isr)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
--- sys/dev/hyperv/vmbus/vmbus.c.orig
+++ sys/dev/hyperv/vmbus/vmbus.c
@@ -46,6 +46,7 @@
 
 #include <machine/bus.h>
 #include <machine/intr_machdep.h>
+#include <machine/md_var.h>
 #include <machine/resource.h>
 #include <x86/include/apicvar.h>
 
@@ -128,7 +129,7 @@
 
 static struct vmbus_softc	*vmbus_sc;
 
-extern inthand_t IDTVEC(vmbus_isr);
+extern inthand_t IDTVEC(vmbus_isr), IDTVEC(vmbus_isr_pti);
 
 static const uint32_t		vmbus_version[] = {
 	VMBUS_VERSION_WIN8_1,
@@ -928,7 +929,8 @@
 	 * All Hyper-V ISR required resources are setup, now let's find a
 	 * free IDT vector for Hyper-V ISR and set it up.
 	 */
-	sc->vmbus_idtvec = lapic_ipi_alloc(IDTVEC(vmbus_isr));
+	sc->vmbus_idtvec = lapic_ipi_alloc(pti ? IDTVEC(vmbus_isr_pti) :
+	    IDTVEC(vmbus_isr));
 	if (sc->vmbus_idtvec < 0) {
 		device_printf(sc->vmbus_dev, "cannot find free IDT vector\n");
 		return ENXIO;
--- sys/i386/i386/apic_vector.s.orig
+++ sys/i386/i386/apic_vector.s
@@ -70,6 +70,7 @@
 #define	ISR_VEC(index, vec_name)					\
 	.text ;								\
 	SUPERALIGN_TEXT ;						\
+IDTVEC(vec_name ## _pti) ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
 	SET_KERNEL_SREGS ;						\
@@ -123,6 +124,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
+IDTVEC(timerint_pti)
 IDTVEC(timerint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
@@ -139,6 +141,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
+IDTVEC(cmcint_pti)
 IDTVEC(cmcint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
@@ -153,6 +156,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
+IDTVEC(errorint_pti)
 IDTVEC(errorint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
--- sys/i386/i386/atpic_vector.s.orig
+++ sys/i386/i386/atpic_vector.s
@@ -46,6 +46,7 @@
 #define	INTR(irq_num, vec_name) \
 	.text ;								\
 	SUPERALIGN_TEXT ;						\
+IDTVEC(vec_name ##_pti) ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
 	SET_KERNEL_SREGS ;						\
--- sys/i386/i386/exception.s.orig
+++ sys/i386/i386/exception.s
@@ -133,6 +133,7 @@
 	TRAP(T_PAGEFLT)
 IDTVEC(mchk)
 	pushl $0; TRAP(T_MCHK)
+IDTVEC(rsvd_pti)
 IDTVEC(rsvd)
 	pushl $0; TRAP(T_RESERVED)
 IDTVEC(fpu)
--- sys/i386/i386/machdep.c.orig
+++ sys/i386/i386/machdep.c
@@ -2577,7 +2577,7 @@
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 #ifdef XENHVM
-	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL,
+	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 #endif
 
--- sys/i386/i386/pmap.c.orig
+++ sys/i386/i386/pmap.c
@@ -283,6 +283,8 @@
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
+int pti;
+
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
@@ -1043,7 +1045,7 @@
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
-	smp_masked_invlpg(*mask, va);
+	smp_masked_invlpg(*mask, va, pmap);
 	sched_unpin();
 }
 
@@ -1077,7 +1079,7 @@
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
-	smp_masked_invlpg_range(*mask, sva, eva);
+	smp_masked_invlpg_range(*mask, sva, eva, pmap);
 	sched_unpin();
 }
 
--- sys/i386/i386/support.s.orig
+++ sys/i386/i386/support.s
@@ -830,3 +830,11 @@
 	movl	$0,PCB_ONFAULT(%ecx)
 	movl	$EFAULT,%eax
 	ret
+
+ENTRY(handle_ibrs_entry)
+	ret
+END(handle_ibrs_entry)
+
+ENTRY(handle_ibrs_exit)
+	ret
+END(handle_ibrs_exit)
--- sys/i386/i386/vm_machdep.c.orig
+++ sys/i386/i386/vm_machdep.c
@@ -795,7 +795,7 @@
 		CPU_NAND(&other_cpus, &sf->cpumask);
 		if (!CPU_EMPTY(&other_cpus)) {
 			CPU_OR(&sf->cpumask, &other_cpus);
-			smp_masked_invlpg(other_cpus, sf->kva);
+			smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap);
 		}
 	}
 	sched_unpin();
--- sys/sys/cpuctl.h.orig
+++ sys/sys/cpuctl.h
@@ -57,5 +57,6 @@
 #define	CPUCTL_MSRSBIT	_IOWR('c', 5, cpuctl_msr_args_t)
 #define	CPUCTL_MSRCBIT	_IOWR('c', 6, cpuctl_msr_args_t)
 #define	CPUCTL_CPUID_COUNT _IOWR('c', 7, cpuctl_cpuid_count_args_t)
+#define	CPUCTL_EVAL_CPU_FEATURES	_IO('c', 8)
 
 #endif /* _CPUCTL_H_ */
--- sys/x86/include/apicvar.h.orig
+++ sys/x86/include/apicvar.h
@@ -179,7 +179,11 @@
 	IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
 	IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
 	IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
-	IDTVEC(spuriousint), IDTVEC(timerint);
+	IDTVEC(spuriousint), IDTVEC(timerint),
+	IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti),
+	IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti),
+	IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti),
+	IDTVEC(spuriousint_pti), IDTVEC(timerint_pti);
 
 extern vm_paddr_t lapic_paddr;
 extern int apic_cpuids[];
--- sys/x86/include/specialreg.h.orig
+++ sys/x86/include/specialreg.h
@@ -374,6 +374,17 @@
 #define	CPUID_STDEXT2_SGXLC	0x40000000
 
 /*
+ * CPUID instruction 7 Structured Extended Features, leaf 0 edx info
+ */
+#define	CPUID_STDEXT3_IBPB	0x04000000
+#define	CPUID_STDEXT3_STIBP	0x08000000
+#define	CPUID_STDEXT3_ARCH_CAP	0x20000000
+
+/* MSR IA32_ARCH_CAP(ABILITIES) bits */
+#define	IA32_ARCH_CAP_RDCL_NO	0x00000001
+#define	IA32_ARCH_CAP_IBRS_ALL	0x00000002
+
+/*
  * CPUID manufacturers identifiers
  */
 #define	AMD_VENDOR_ID		"AuthenticAMD"
@@ -401,6 +412,8 @@
 #define	MSR_EBL_CR_POWERON	0x02a
 #define	MSR_TEST_CTL		0x033
 #define	MSR_IA32_FEATURE_CONTROL 0x03a
+#define	MSR_IA32_SPEC_CTRL	0x048
+#define	MSR_IA32_PRED_CMD	0x049
 #define	MSR_BIOS_UPDT_TRIG	0x079
 #define	MSR_BBL_CR_D0		0x088
 #define	MSR_BBL_CR_D1		0x089
@@ -413,6 +426,7 @@
 #define	MSR_APERF		0x0e8
 #define	MSR_IA32_EXT_CONFIG	0x0ee	/* Undocumented. Core Solo/Duo only */
 #define	MSR_MTRRcap		0x0fe
+#define	MSR_IA32_ARCH_CAP	0x10a
 #define	MSR_BBL_CR_ADDR		0x116
 #define	MSR_BBL_CR_DECC		0x118
 #define	MSR_BBL_CR_CTL		0x119
@@ -556,6 +570,17 @@
 #define	IA32_MISC_EN_XDD	0x0000000400000000ULL
 
 /*
+ * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel
+ * document 336996-001 Speculative Execution Side Channel Mitigations.
+ */
+/* MSR IA32_SPEC_CTRL */
+#define	IA32_SPEC_CTRL_IBRS	0x00000001
+#define	IA32_SPEC_CTRL_STIBP	0x00000002
+
+/* MSR IA32_PRED_CMD */
+#define	IA32_PRED_CMD_IBPB_BARRIER	0x0000000000000001ULL
+
+/*
  * PAT modes.
  */
 #define	PAT_UNCACHEABLE		0x00
--- sys/x86/include/x86_smp.h.orig
+++ sys/x86/include/x86_smp.h
@@ -37,6 +37,7 @@
 extern int cpu_cores;
 extern volatile uint32_t smp_tlb_generation;
 extern struct pmap *smp_tlb_pmap;
+extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 extern u_int xhits_gbl[];
 extern u_int xhits_pg[];
 extern u_int xhits_rng[];
@@ -95,9 +96,9 @@
 u_int	mp_bootaddress(u_int);
 void	set_interrupt_apic_ids(void);
 void	smp_cache_flush(void);
-void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr);
+void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap);
 void	smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
-	    vm_offset_t endva);
+	    vm_offset_t endva, struct pmap *pmap);
 void	smp_masked_invltlb(cpuset_t mask, struct pmap *pmap);
 void	mem_range_AP_init(void);
 void	topo_probe(void);
--- sys/x86/include/x86_var.h.orig
+++ sys/x86/include/x86_var.h
@@ -50,6 +50,8 @@
 extern	u_int	cpu_clflush_line_size;
 extern	u_int	cpu_stdext_feature;
 extern	u_int	cpu_stdext_feature2;
+extern	u_int	cpu_stdext_feature3;
+extern	uint64_t cpu_ia32_arch_caps;
 extern	u_int	cpu_fxsr;
 extern	u_int	cpu_high;
 extern	u_int	cpu_id;
@@ -78,6 +80,7 @@
 extern	int	_ugssel;
 extern	int	use_xsave;
 extern	uint64_t xsave_mask;
+extern int	pti;
 
 struct	pcb;
 struct	thread;
@@ -115,7 +118,9 @@
 void	cpu_setregs(void);
 void	dump_add_page(vm_paddr_t);
 void	dump_drop_page(vm_paddr_t);
-void	identify_cpu(void);
+void	finishidentcpu(void);
+void	identify_cpu1(void);
+void	identify_cpu2(void);
 void	initializecpu(void);
 void	initializecpucache(void);
 bool	fix_cpuid(void);
@@ -122,11 +127,15 @@
 void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
 int	is_physical_memory(vm_paddr_t addr);
 int	isa_nmi(int cd);
+void	handle_ibrs_entry(void);
+void	handle_ibrs_exit(void);
+void	hw_ibrs_recalculate(void);
 void	nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame);
 void	nmi_call_kdb_smp(u_int type, struct trapframe *frame);
 void	nmi_handle_intr(u_int type, struct trapframe *frame);
 void	pagecopy(void *from, void *to);
 void	printcpuinfo(void);
+int	pti_get_default(void);
 int	user_dbreg_trap(void);
 int	minidumpsys(struct dumperinfo *);
 struct pcb *get_pcb_td(struct thread *td);
--- sys/x86/isa/atpic.c.orig
+++ sys/x86/isa/atpic.c
@@ -86,6 +86,16 @@
 	IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11),
 	IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14),
 	IDTVEC(atpic_intr15);
+/* XXXKIB i386 uses stubs until pti comes */
+inthand_t
+	IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
+	IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
+	IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
+	IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
+	IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
+	IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
+	IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
+	IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);
 
 #define	IRQ(ap, ai)	((ap)->at_irqbase + (ai)->at_irq)
 
@@ -98,7 +108,7 @@
 
 #define	INTSRC(irq)							\
 	{ { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ),	\
-	    (irq) % 8 }
+	    IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 }
 
 struct atpic {
 	struct pic at_pic;
@@ -110,7 +120,7 @@
 
 struct atpic_intsrc {
 	struct intsrc at_intsrc;
-	inthand_t *at_intr;
+	inthand_t *at_intr, *at_intr_pti;
 	int	at_irq;			/* Relative to PIC base. */
 	enum intr_trigger at_trigger;
 	u_long	at_count;
@@ -435,7 +445,8 @@
 		ai->at_intsrc.is_count = &ai->at_count;
 		ai->at_intsrc.is_straycount = &ai->at_straycount;
 		setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
-		    ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
+		    ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
+		    SEL_KPL, GSEL_ATPIC);
 	}
 
 #ifdef DEV_MCA
--- sys/x86/x86/cpu_machdep.c.orig
+++ sys/x86/x86/cpu_machdep.c
@@ -139,6 +139,12 @@
 	int *state;
 
 	/*
+	 * A comment in Linux patch claims that 'CPUs run faster with
+	 * speculation protection disabled. All CPU threads in a core
+	 * must disable speculation protection for it to be
+	 * disabled. Disable it while we are idle so the other
+	 * hyperthread can run fast.'
+	 *
 	 * XXXKIB.  Software coordination mode should be supported,
 	 * but all Intel CPUs provide hardware coordination.
 	 */
@@ -147,9 +153,11 @@
 	KASSERT(*state == STATE_SLEEPING,
 		("cpu_mwait_cx: wrong monitorbuf state"));
 	*state = STATE_MWAIT;
+	handle_ibrs_entry();
 	cpu_monitor(state, 0, 0);
 	if (*state == STATE_MWAIT)
 		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
+	handle_ibrs_exit();
 
 	/*
 	 * We should exit on any event that interrupts mwait, because
@@ -578,3 +586,48 @@
 	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
 #endif
 }
+
+int hw_ibrs_active;
+int hw_ibrs_disable = 1;
+
+SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
+    "Indirect Branch Restricted Speculation active");
+
+void
+hw_ibrs_recalculate(void)
+{
+	uint64_t v;
+
+	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
+		if (hw_ibrs_disable) {
+			v = rdmsr(MSR_IA32_SPEC_CTRL);
+			v &= ~(uint64_t)IA32_SPEC_CTRL_IBRS;
+			wrmsr(MSR_IA32_SPEC_CTRL, v);
+		} else {
+			v = rdmsr(MSR_IA32_SPEC_CTRL);
+			v |= IA32_SPEC_CTRL_IBRS;
+			wrmsr(MSR_IA32_SPEC_CTRL, v);
+		}
+		hw_ibrs_active = hw_ibrs_disable == 0;
+		return;
+	}
+	hw_ibrs_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 &&
+	    !hw_ibrs_disable;
+}
+
+static int
+hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = hw_ibrs_disable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	hw_ibrs_disable = val != 0;
+	hw_ibrs_recalculate();
+	return (0);
+}
+SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
+    CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
+    "Disable Indirect Branch Restricted Speculation");
--- sys/x86/x86/identcpu.c.orig
+++ sys/x86/x86/identcpu.c
@@ -104,8 +104,10 @@
 u_int	cpu_fxsr;		/* SSE enabled */
 u_int	cpu_mxcsr_mask;		/* Valid bits in mxcsr */
 u_int	cpu_clflush_line_size = 32;
-u_int	cpu_stdext_feature;
-u_int	cpu_stdext_feature2;
+u_int	cpu_stdext_feature;	/* %ebx */
+u_int	cpu_stdext_feature2;	/* %ecx */
+u_int	cpu_stdext_feature3;	/* %edx */
+uint64_t cpu_ia32_arch_caps;
 u_int	cpu_max_ext_state_size;
 u_int	cpu_mon_mwait_flags;	/* MONITOR/MWAIT flags (CPUID.05H.ECX) */
 u_int	cpu_mon_min_size;	/* MONITOR minimum range size, bytes */
@@ -978,6 +980,16 @@
 				       );
 			}
 
+			if (cpu_stdext_feature3 != 0) {
+				printf("\n  Structured Extended Features3=0x%b",
+				    cpu_stdext_feature3,
+				       "\020"
+				       "\033IBPB"
+				       "\034STIBP"
+				       "\036ARCH_CAP"
+				       );
+			}
+
 			if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
 				cpuid_count(0xd, 0x1, regs);
 				if (regs[0] != 0) {
@@ -991,6 +1003,15 @@
 				}
 			}
 
+			if (cpu_ia32_arch_caps != 0) {
+				printf("\n  IA32_ARCH_CAPS=0x%b",
+				    (u_int)cpu_ia32_arch_caps,
+				       "\020"
+				       "\001RDCL_NO"
+				       "\002IBRS_ALL"
+				       );
+			}
+
 			if (via_feature_rng != 0 || via_feature_xcrypt != 0)
 				print_via_padlock_info();
 
@@ -1370,23 +1391,11 @@
 	return (false);
 }
 
-/*
- * Final stage of CPU identification.
- */
-#ifdef __i386__
 void
-finishidentcpu(void)
-#else
-void
-identify_cpu(void)
-#endif
+identify_cpu1(void)
 {
-	u_int regs[4], cpu_stdext_disable;
-#ifdef __i386__
-	u_char ccr3;
-#endif
+	u_int regs[4];
 
-#ifdef __amd64__
 	do_cpuid(0, regs);
 	cpu_high = regs[0];
 	((u_int *)&cpu_vendor)[0] = regs[1];
@@ -1399,6 +1408,44 @@
 	cpu_procinfo = regs[1];
 	cpu_feature = regs[3];
 	cpu_feature2 = regs[2];
+}
+
+void
+identify_cpu2(void)
+{
+	u_int regs[4], cpu_stdext_disable;
+
+	if (cpu_high >= 7) {
+		cpuid_count(7, 0, regs);
+		cpu_stdext_feature = regs[1];
+
+		/*
+		 * Some hypervisors failed to filter out unsupported
+		 * extended features.  Allow disabling the
+		 * extensions, activation of which requires setting a
+		 * bit in CR4, and which VM monitors do not support.
+		 */
+		cpu_stdext_disable = 0;
+		TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
+		cpu_stdext_feature &= ~cpu_stdext_disable;
+
+		cpu_stdext_feature2 = regs[2];
+		cpu_stdext_feature3 = regs[3];
+
+		if ((cpu_stdext_feature3 & CPUID_STDEXT3_ARCH_CAP) != 0)
+			cpu_ia32_arch_caps = rdmsr(MSR_IA32_ARCH_CAP);
+	}
+}
+
+/*
+ * Final stage of CPU identification.
+ */
+void
+finishidentcpu(void)
+{
+	u_int regs[4];
+#ifdef __i386__
+	u_char ccr3;
 #endif
 
 	identify_hypervisor();
@@ -1416,26 +1463,8 @@
 		cpu_mon_max_size = regs[1] &  CPUID5_MON_MAX_SIZE;
 	}
 
-	if (cpu_high >= 7) {
-		cpuid_count(7, 0, regs);
-		cpu_stdext_feature = regs[1];
+	identify_cpu2();
 
-		/*
-		 * Some hypervisors fail to filter out unsupported
-		 * extended features.  For now, disable the
-		 * extensions, activation of which requires setting a
-		 * bit in CR4, and which VM monitors do not support.
-		 */
-		if (cpu_feature2 & CPUID2_HV) {
-			cpu_stdext_disable = CPUID_STDEXT_FSGSBASE |
-			    CPUID_STDEXT_SMEP;
-		} else
-			cpu_stdext_disable = 0;
-		TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable);
-		cpu_stdext_feature &= ~cpu_stdext_disable;
-		cpu_stdext_feature2 = regs[2];
-	}
-
 #ifdef __i386__
 	if (cpu_high > 0 &&
 	    (cpu_vendor_id == CPU_VENDOR_INTEL ||
@@ -1563,6 +1592,17 @@
 #endif
 }
 
+int
+pti_get_default(void)
+{
+
+	if (strcmp(cpu_vendor, AMD_VENDOR_ID) == 0)
+		return (0);
+	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) != 0)
+		return (0);
+	return (1);
+}
+
 static u_int
 find_cpu_vendor_id(void)
 {
--- sys/x86/x86/local_apic.c.orig
+++ sys/x86/x86/local_apic.c
@@ -166,6 +166,16 @@
 	IDTVEC(apic_isr7),	/* 224 - 255 */
 };
 
+static inthand_t *ioint_pti_handlers[] = {
+	NULL,			/* 0 - 31 */
+	IDTVEC(apic_isr1_pti),	/* 32 - 63 */
+	IDTVEC(apic_isr2_pti),	/* 64 - 95 */
+	IDTVEC(apic_isr3_pti),	/* 96 - 127 */
+	IDTVEC(apic_isr4_pti),	/* 128 - 159 */
+	IDTVEC(apic_isr5_pti),	/* 160 - 191 */
+	IDTVEC(apic_isr6_pti),	/* 192 - 223 */
+	IDTVEC(apic_isr7_pti),	/* 224 - 255 */
+};
 
 static u_int32_t lapic_timer_divisors[] = {
 	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
@@ -172,7 +182,7 @@
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
-extern inthand_t IDTVEC(rsvd);
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);
 
 volatile char *lapic_map;
 vm_paddr_t lapic_paddr;
@@ -489,15 +499,18 @@
 	PCPU_SET(apic_id, lapic_id());
 
 	/* Local APIC timer interrupt. */
-	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* Local APIC error interrupt. */
-	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* XXX: Thermal interrupt */
 
 	/* Local APIC CMCI. */
-	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
+	    SDT_APICT, SEL_KPL, GSEL_APIC);
 
 	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
 		arat = 0;
@@ -1561,8 +1574,8 @@
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
-	setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
-	    GSEL_APIC);
+	setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 }
 
 static void
@@ -1581,7 +1594,8 @@
 	 * We can not currently clear the idt entry because other cpus
 	 * may have a valid vector at this offset.
 	 */
-	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
 #endif
 }
 
@@ -2084,7 +2098,8 @@
 	long func;
 	int idx, vector;
 
-	KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc));
+	KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
+	    ("invalid ipifunc %p", ipifunc));
 
 	vector = -1;
 	mtx_lock_spin(&icu_lock);
@@ -2091,7 +2106,8 @@
 	for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) {
 		ip = &idt[idx];
 		func = (ip->gd_hioffset << 16) | ip->gd_looffset;
-		if (func == (uintptr_t)&IDTVEC(rsvd)) {
+		if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) ||
+		    (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) {
 			vector = idx;
 			setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC);
 			break;
@@ -2113,8 +2129,10 @@
 	mtx_lock_spin(&icu_lock);
 	ip = &idt[vector];
 	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
-	KASSERT(func != (uintptr_t)&IDTVEC(rsvd),
+	KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
+	    func != (uintptr_t)&IDTVEC(rsvd_pti),
 	    ("invalid idtfunc %#lx", func));
-	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
 	mtx_unlock_spin(&icu_lock);
 }
--- sys/x86/x86/mp_x86.c.orig
+++ sys/x86/x86/mp_x86.c
@@ -1436,7 +1436,7 @@
  */
 
 /* Variables needed for SMP tlb shootdown. */
-static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 pmap_t smp_tlb_pmap;
 volatile uint32_t smp_tlb_generation;
 
@@ -1509,11 +1509,11 @@
 }
 
 void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr)
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0);
+		smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_page++;
 #endif
@@ -1521,11 +1521,12 @@
 }
 
 void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2)
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap)
 {
 
 	if (smp_started) {
-		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL,
+		smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
 		    addr1, addr2);
 #ifdef COUNT_XINVLTLB_HITS
 		ipi_range++;
--- sys/x86/xen/pv.c.orig
+++ sys/x86/xen/pv.c
@@ -97,6 +97,7 @@
 #ifdef SMP
 /* Variables used by amd64 mp_machdep to start APs */
 extern char *doublefault_stack;
+extern char *mce_stack;
 extern char *nmi_stack;
 #endif
 
@@ -217,6 +218,8 @@
 	    (void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO);
 	doublefault_stack =
 	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
+	mce_stack =
+	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
 	nmi_stack =
 	    (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO);
 	dpcpu =
--- usr.sbin/cpucontrol/cpucontrol.8.orig
+++ usr.sbin/cpucontrol/cpucontrol.8
@@ -24,7 +24,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 30, 2009
+.Dd January 5, 2018
 .Dt CPUCONTROL 8
 .Os
 .Sh NAME
@@ -36,44 +36,49 @@
+.Bk
 .Nm
 .Op Fl vh
 .Fl m Ar msr
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl m Ar msr Ns = Ns Ar value
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl m Ar msr Ns &= Ns Ar mask
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl m Ar msr Ns |= Ns Ar mask
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl i Ar level
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Fl i Ar level,level_type
-.Bk
 .Ar device
 .Ek
+.Bk
 .Nm
 .Op Fl vh
 .Op Fl d Ar datadir
 .Fl u
+.Ar device
+.Ek
 .Bk
+.Nm
+.Fl e
 .Ar device
 .Ek
 .Sh DESCRIPTION
@@ -129,6 +133,20 @@
 .Nm
 utility will walk through the configured data directories
 and apply all firmware updates available for this CPU.
+.It Fl e
+Re-evaluate the kernel flags indicating the present CPU features.
+This command is typically executed after a firmware update was applied
+which changes information reported by the
+.Dv CPUID
+instruction.
+.Pp
+.Bf -symbolic
+Only execute the
+.Fl e
+command after the microcode update was applied to all CPUs in the system.
+The kernel does not operate correctly if the features of processors are
+not identical.
+.Ef
 .It Fl v
 Increase the verbosity level.
 .It Fl h
--- usr.sbin/cpucontrol/cpucontrol.c.orig
+++ usr.sbin/cpucontrol/cpucontrol.c
@@ -60,6 +60,7 @@
 #define	FLAG_I	0x01
 #define	FLAG_M	0x02
 #define	FLAG_U	0x04
+#define	FLAG_E	0x10
 
 #define	OP_INVAL	0x00
 #define	OP_READ		0x01
@@ -114,7 +115,7 @@
 	if (name == NULL)
 		name = "cpuctl";
 	fprintf(stderr, "Usage: %s [-vh] [-d datadir] [-m msr[=value] | "
-	    "-i level | -i level,level_type | -u] device\n", name);
+	    "-i level | -i level,level_type | -e | -u] device\n", name);
 	exit(EX_USAGE);
 }
 
@@ -338,6 +339,25 @@
 }
 
 static int
+do_eval_cpu_features(const char *dev)
+{
+	int fd, error;
+
+	assert(dev != NULL);
+
+	fd = open(dev, O_RDWR);
+	if (fd < 0) {
+		WARN(0, "error opening %s for writing", dev);
+		return (1);
+	}
+	error = ioctl(fd, CPUCTL_EVAL_CPU_FEATURES, NULL);
+	if (error < 0)
+		WARN(0, "ioctl(%s, CPUCTL_EVAL_CPU_FEATURES)", dev);
+	close(fd);
+	return (error);
+}
+
+static int
 do_update(const char *dev)
 {
 	int fd;
@@ -431,11 +451,14 @@
 	 * Add all default data dirs to the list first.
 	 */
 	datadir_add(DEFAULT_DATADIR);
-	while ((c = getopt(argc, argv, "d:hi:m:uv")) != -1) {
+	while ((c = getopt(argc, argv, "d:ehi:m:uv")) != -1) {
 		switch (c) {
 		case 'd':
 			datadir_add(optarg);
 			break;
+		case 'e':
+			flags |= FLAG_E;
+			break;
 		case 'i':
 			flags |= FLAG_I;
 			cmdarg = optarg;
@@ -464,22 +487,25 @@
 		/* NOTREACHED */
 	}
 	dev = argv[0];
-	c = flags & (FLAG_I | FLAG_M | FLAG_U);
+	c = flags & (FLAG_E | FLAG_I | FLAG_M | FLAG_U);
 	switch (c) {
-		case FLAG_I:
-			if (strstr(cmdarg, ",") != NULL)
-				error = do_cpuid_count(cmdarg, dev);
-			else
-				error = do_cpuid(cmdarg, dev);
-			break;
-		case FLAG_M:
-			error = do_msr(cmdarg, dev);
-			break;
-		case FLAG_U:
-			error = do_update(dev);
-			break;
-		default:
-			usage();	/* Only one command can be selected. */
+	case FLAG_I:
+		if (strstr(cmdarg, ",") != NULL)
+			error = do_cpuid_count(cmdarg, dev);
+		else
+			error = do_cpuid(cmdarg, dev);
+		break;
+	case FLAG_M:
+		error = do_msr(cmdarg, dev);
+		break;
+	case FLAG_U:
+		error = do_update(dev);
+		break;
+	case FLAG_E:
+		error = do_eval_cpu_features(dev);
+		break;
+	default:
+		usage();	/* Only one command can be selected. */
 	}
 	SLIST_FREE(&datadirs, next, free);
 	return (error == 0 ? 0 : 1);
